Changes from all commits
22 commits
79c9e52
updating branch
areenraj Jun 10, 2025
6686171
updating branch
areenraj Jun 10, 2025
b7591f4
Major commit: graph partitioning algorithms, level scheduling method,…
areenraj Jun 25, 2025
6f51f45
Major commit: graph partitioning algorithms, level scheduling method,…
areenraj Jun 25, 2025
cdf0225
resolving conflicts
areenraj Jun 25, 2025
9dd3028
resolving conflicts
areenraj Jun 25, 2025
2495edd
resolving some more conflicts
areenraj Jun 25, 2025
f68358e
cleaning up
areenraj Jun 25, 2025
bf561b6
apologies for repeated commits, just cleaning
areenraj Jun 25, 2025
9121e25
coalesced memory access for MVP, shared memory addition and lamda fun…
areenraj Jun 29, 2025
991e29f
bug fixes
areenraj Jul 1, 2025
9bfeff9
Merge remote-tracking branch 'upstream/master'
areenraj Jul 3, 2025
0372099
Working GPU LU_SGS Preconditioner Port
areenraj Jul 15, 2025
52b90b6
Fixed the issue with the visibility of the rowsPerBlock variable. Als…
areenraj Jul 17, 2025
d367627
Working LU_SGS Preconditioner with graph partitioned algorithms, upda…
areenraj Jul 17, 2025
661c9b8
LU_SGS Preconditioner Port
areenraj Jul 17, 2025
b5cf7dd
Merge branch 'master' of https://github.com/areenraj/SU2_GSoC_GPU
areenraj Jul 17, 2025
c4dbe5c
Fixing warnings
areenraj Jul 17, 2025
b3d2fbf
Merge branch 'develop' of https://github.com/su2code/SU2
areenraj Jul 17, 2025
1be1e2f
Syncing repo to develop
areenraj Jul 17, 2025
7472bd1
updating submodule versions
areenraj Jul 17, 2025
352e148
Fixing some more warnings
areenraj Jul 17, 2025
21 changes: 21 additions & 0 deletions Common/include/CConfig.hpp
@@ -521,6 +521,7 @@ class CConfig {
Kind_Gradient_Method_Recon, /*!< \brief Numerical method for computation of spatial gradients used for upwind reconstruction. */
Kind_Deform_Linear_Solver, /*!< Numerical method to deform the grid */
Kind_Deform_Linear_Solver_Prec, /*!< \brief Preconditioner of the linear solver. */
Kind_Graph_Part_Algo, /*!< \brief Algorithm for parallel partitioning of the matrix graph. */
Align with other comments

Suggested change
Kind_Graph_Part_Algo, /*!< \brief Algorithm for parallel partitioning of the matrix graph. */
Kind_Graph_Part_Algo, /*!< \brief Algorithm for parallel partitioning of the matrix graph. */

Kind_Linear_Solver, /*!< \brief Numerical solver for the implicit scheme. */
Kind_Linear_Solver_Prec, /*!< \brief Preconditioner of the linear solver. */
Kind_DiscAdj_Linear_Solver, /*!< \brief Linear solver for the discrete adjoint system. */
@@ -634,6 +635,8 @@ class CConfig {
unsigned long Linear_Solver_Restart_Frequency; /*!< \brief Restart frequency of the linear solver for the implicit formulation. */
unsigned long Linear_Solver_Prec_Threads; /*!< \brief Number of threads per rank for ILU and LU_SGS preconditioners. */
unsigned short Linear_Solver_ILU_n; /*!< \brief ILU fill-in level. */
unsigned short Cuda_Block_Size; /*!< \brief User-specified value for the X-Axis dimension of thread blocks
that are deployed by the CUDA Kernels. */
su2double SemiSpan; /*!< \brief Wing Semi span. */
su2double Roe_Kappa; /*!< \brief Relaxation of the Roe scheme. */
su2double Relaxation_Factor_Adjoint; /*!< \brief Relaxation coefficient for variable updates of adjoint solvers. */
@@ -4160,6 +4163,12 @@ class CConfig {
*/
bool GetLeastSquaresRequired(void) const { return LeastSquaresRequired; }

/*!
* \brief Get the type of algorithm used for partitioning the matrix graph.
* \return Algorithm that divides the matrix into partitions that are executed in parallel.
*/
unsigned short GetKind_Graph_Part_Algo(void) const { return Kind_Graph_Part_Algo; }

/*!
* \brief Get the kind of solver for the implicit solver.
* \return Numerical solver for implicit formulation (solving the linear system).
@@ -4221,6 +4230,18 @@
*/
su2double GetLinear_Solver_Smoother_Relaxation(void) const { return Linear_Solver_Smoother_Relaxation; }

/*!
* \brief Get thread block dimensions (X-axis) being used by the CUDA Kernels.
* \return Thread block dimensions (X-axis) being used by the CUDA Kernels.
*/
unsigned short GetCuda_Block_Size(void) const { return Cuda_Block_Size; }

/*!
* \brief Get the number of matrix rows assigned per CUDA Block.
* \return The number of matrix rows assigned per CUDA Block.
*/
unsigned short GetRows_Per_Cuda_Block(void) const { return cudaKernelParameters::rounded_up_division(cudaKernelParameters::CUDA_WARP_SIZE, Cuda_Block_Size); }

Comment on lines +4239 to +4244
Move somewhere in linear algebra since it's not really a config option
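For reference, GetRows_Per_Cuda_Block() pairs one warp with one matrix row, so the value is the thread-block size divided by the warp size, rounded up. A minimal standalone sketch of that arithmetic, assuming the helper behaves like the ceiling division shown later in GPUComms.cuh:

#include <cassert>

/*--- Assumed behaviour of cudaKernelParameters::rounded_up_division(multiple, x): ceil(x / multiple). ---*/
constexpr int rounded_up_division(int multiple, int x) { return (x + multiple - 1) / multiple; }
constexpr int CUDA_WARP_SIZE = 32;

int main() {
  /*--- A block of 1024 threads holds 32 warps, so 32 matrix rows are handled per block. ---*/
  assert(rounded_up_division(CUDA_WARP_SIZE, 1024) == 32);
  /*--- A block size that is not a multiple of the warp size still rounds up: 96 threads -> 3 rows. ---*/
  assert(rounded_up_division(CUDA_WARP_SIZE, 96) == 3);
  return 0;
}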

/*!
* \brief Get the relaxation factor for solution updates of adjoint solvers.
*/
7 changes: 7 additions & 0 deletions Common/include/geometry/CGeometry.hpp
@@ -260,6 +260,13 @@ class CGeometry {
unsigned long* nPointCumulative{nullptr}; /*!< \brief Cumulative storage array containing the total number of points
on all prior ranks in the linear partitioning. */

unsigned long nPartition; /*!< \brief Number of divisions of the matrix graph during execution of parallel
partitioning algorithms. */
Comment on lines +263 to +264
Let's call it levels or colors, partition is associated with MPI parallelization

unsigned long maxPartitionSize; /*!< \brief Size of the level with the maximum number of elements. */
vector<unsigned long>
partitionOffsets; /*!< \brief Vector array containing the indices at which different parallel partitions begin. */
vector<unsigned long> chainPtr; /*!< \brief Vector array containing distribution of levels into chains. */

/*--- Data structures for point-to-point MPI communications. ---*/

int maxCountPerPoint{0}; /*!< \brief Maximum number of pieces of data sent per vertex in point-to-point comms. */
8 changes: 8 additions & 0 deletions Common/include/geometry/CPhysicalGeometry.hpp
@@ -148,6 +148,14 @@ class CPhysicalGeometry final : public CGeometry {
*/
void DistributeColoring(const CConfig* config, CGeometry* geometry);

/*!
* \brief Divide the graph produced by the matrix into parallel partitions.
* \param[in] config - Definition of the particular problem.
* \param[in] pointList - Ordered list of points in the mesh.
*/
template <class ScalarType>
void PartitionGraph(const CConfig* config, vector<ScalarType>& pointList);

/*!
* \brief Distribute the grid points, including ghost points, across all ranks based on a ParMETIS coloring.
* \param[in] config - Definition of the particular problem.
182 changes: 182 additions & 0 deletions Common/include/linear_algebra/CGraphPartitioning.hpp
@@ -0,0 +1,182 @@
/*!
* \file CGraphPartitioning.hpp
* \brief Headers for the classes realted to the algorithms that are used
to divide the matrix acyclic graph into parallel partitions.
* \author A. Raj
* \version 8.2.0 "Harrier"
*
* SU2 Project Website: https://su2code.github.io
*
* The SU2 Project is maintained by the SU2 Foundation
* (http://su2foundation.org)
*
* Copyright 2012-2025, SU2 Contributors (cf. AUTHORS.md)
*
* SU2 is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* SU2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with SU2. If not, see <http://www.gnu.org/licenses/>.
*/

#pragma once

#include "../geometry/CGeometry.hpp"

/*!
* \class CGraphPartitioning
* \brief Abstract base class for defining graph partitioning algorithms.
* \author A. Raj
*
* In order to use certain parallel algorithms in the solution process -
* whether with linear solvers or preconditioners - we require the matrix
 * to be partitioned into certain parallel divisions. These may be in the form
* of levels, blocks, colors and so on. Since a number of different algorithms
* can be used to split the graph, we've introduced a base class containing the
* "Partition" member function from which child classes of the specific
* algorithm can be derived. Currently, we are only using direct declarations
* of the derived classes in the code. However, this method was chosen as it
* allows us to pass different child class algorithms to a single implementation
* of the function that requires it - similar to the CMatrixVectorProduct class.
*/

template <class ScalarType>

class CGraphPartitioning {
public:
virtual ~CGraphPartitioning() = 0;
virtual void Partition(vector<ScalarType>& pointList, vector<ScalarType>& partitionOffsets,
vector<ScalarType>& chainPtr, unsigned short chainLimit) = 0;
};
template <class ScalarType>
CGraphPartitioning<ScalarType>::~CGraphPartitioning() {}
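A rough usage sketch of this interface (the wrapper function below is hypothetical and only illustrates the intent described in the class documentation):

template <class IndexType>
void ApplyPartitioning(CGraphPartitioning<IndexType>& algorithm, vector<IndexType>& pointList,
                       vector<IndexType>& partitionOffsets, vector<IndexType>& chainPtr,
                       unsigned short chainLimit) {
  /*--- The caller does not need to know which concrete algorithm (level scheduling, coloring, ...) it was given. ---*/
  algorithm.Partition(pointList, partitionOffsets, chainPtr, chainLimit);
}
/*--- e.g. CLevelScheduling<unsigned long> scheduler(nPointDomain, nodes);
          ApplyPartitioning(scheduler, pointList, levelOffsets, chainPtr, rowsPerBlock); ---*/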

template <class ScalarType>

Suggested change

class CLevelScheduling final : public CGraphPartitioning<ScalarType> {
private:
ScalarType nPointDomain;
Scalar is usually a floating point but here it's an integer, rename to IndexType or something similar.
See what I have in graph_toolbox.h

CPoint* nodes;

public:
ScalarType nLevels;
ScalarType maxLevelWidth;
vector<ScalarType> levels;

/*!
 * \brief Constructor of the class.
* \param[in] nPointDomain_ref - number of points associated with the problem
* \param[in] nodes_ref - represents the relationships between the points
*/
inline CLevelScheduling<ScalarType>(ScalarType nPointDomain_ref, CPoint* nodes_ref)
Suggested change
inline CLevelScheduling<ScalarType>(ScalarType nPointDomain_ref, CPoint* nodes_ref)
CLevelScheduling(ScalarType nPointDomain_ref, CPoint* nodes_ref)

: nPointDomain(nPointDomain_ref), nodes(nodes_ref) {
nLevels = 0ul;
maxLevelWidth = 0ul;
}

CLevelScheduling() = delete; // Removing default constructor

Comment on lines +84 to +85
It's already deleted because you have defined a constructor

Suggested change
CLevelScheduling() = delete; // Removing default constructor

/*!
* \brief Divides the levels into groups of chains depending on the preset GPU block and warp size.
* \param[in] levelOffsets - Represents the vector array containing the ordered list of starting rows of each level.
* \param[in] chainPtr - Represents the vector array containing the ordered list of starting levels of each chain.
 * \param[in] rowsPerBlock - Represents the maximum number of rows that can be accommodated per CUDA block.
*/
void CalculateChain(vector<ScalarType> levelOffsets, vector<ScalarType>& chainPtr, unsigned short rowsPerBlock) {
ScalarType levelWidth = 0;

/*This is not a magic number. We are simply initializing
the point array with its first element that is always zero.*/
chainPtr.push_back(0);

for (ScalarType iLevel = 0ul; iLevel < nLevels; iLevel++) {
levelWidth = levelOffsets[iLevel + 1] - levelOffsets[iLevel];
maxLevelWidth = std::max(levelWidth, maxLevelWidth);

if (levelWidth > rowsPerBlock) {
if (chainPtr.back() != iLevel) {
chainPtr.push_back(iLevel);
}

chainPtr.push_back(iLevel + 1);
}
}

chainPtr.push_back(nLevels);
}
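As a worked example of the grouping above, with assumed values: four levels of widths 1, 2, 4 and 1 give levelOffsets = {0, 1, 3, 7, 8}; with rowsPerBlock = 2 only the third level is too wide, so the loop produces chainPtr = {0, 2, 3, 4}. A standalone sketch that reproduces this:

#include <cassert>
#include <vector>

int main() {
  const std::vector<unsigned long> levelOffsets = {0, 1, 3, 7, 8};  /*--- four levels of widths 1, 2, 4, 1 ---*/
  const unsigned short rowsPerBlock = 2;
  const unsigned long nLevels = levelOffsets.size() - 1;

  std::vector<unsigned long> chainPtr = {0};  /*--- the first chain always starts at level 0 ---*/
  for (unsigned long iLevel = 0; iLevel < nLevels; ++iLevel) {
    const auto levelWidth = levelOffsets[iLevel + 1] - levelOffsets[iLevel];
    if (levelWidth > rowsPerBlock) {
      if (chainPtr.back() != iLevel) chainPtr.push_back(iLevel);
      chainPtr.push_back(iLevel + 1);
    }
  }
  chainPtr.push_back(nLevels);

  /*--- Levels 0-1 form one chain, the wide level 2 its own chain, and level 3 the last chain. ---*/
  assert((chainPtr == std::vector<unsigned long>{0, 2, 3, 4}));
  return 0;
}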

/*!
* \brief Reorders the points according to the levels
* \param[in] pointList - Ordered array that contains the list of all mesh points.
* \param[in] inversePointList - Array utilized to access the index of each point in pointList.
* \param[in] levelOffsets - Vector array containing the ordered list of starting rows of each level.
*/
void Reorder(vector<ScalarType>& pointList, vector<ScalarType>& inversePointList, vector<ScalarType> levelOffsets) {
Comment on lines +117 to +121
point list is an output, put the function signature, inputs should come before outputs

for (auto localPoint = 0ul; localPoint < nPointDomain; ++localPoint) {
const auto globalPoint = pointList[localPoint];
inversePointList[levelOffsets[levels[localPoint]]++] = globalPoint;
}

pointList = std::move(inversePointList);
}

/*!
 * \brief Partitions the graph into levels, reorders the points accordingly, and groups the levels into chains.
* \param[in] pointList - Ordered array that contains the list of all mesh points.
* \param[in] levelOffsets - Vector array containing the ordered list of starting rows of each level.
* \param[in] chainPtr - Represents the vector array containing the ordered list of starting levels of each chain.
 * \param[in] rowsPerBlock - Represents the maximum number of rows that can be accommodated per CUDA block.
Comment on lines +132 to +135
Inputs should be const&, if they can't be const, they are either in,out or out

*/
void Partition(vector<ScalarType>& pointList, vector<ScalarType>& levelOffsets, vector<ScalarType>& chainPtr,
unsigned short rowsPerBlock) override {
vector<ScalarType> inversePointList;
inversePointList.reserve(nPointDomain);
levels.reserve(nPointDomain);

for (auto point = 0ul; point < nPointDomain; point++) {
inversePointList[pointList[point]] = point;
levels[point] = 0;
}

// Local Point - Index of a point after the RCM reordering
// Global Point - Original index of a point before the RCM reordering

for (auto localPoint = 0ul; localPoint < nPointDomain; ++localPoint) {
const auto globalPoint = pointList[localPoint];

for (auto adjPoints = 0u; adjPoints < nodes->GetnPoint(globalPoint); adjPoints++) {
const auto adjGlobalPoint = nodes->GetPoint(globalPoint, adjPoints);

if (adjGlobalPoint < nPointDomain) {
const auto adjLocalPoint = inversePointList[adjGlobalPoint];

if (adjLocalPoint < localPoint) {
levels[localPoint] = std::max(levels[localPoint], levels[adjLocalPoint] + 1);
}
}
}

nLevels = std::max(nLevels, levels[localPoint] + 1);
}

levelOffsets.resize(nLevels + 1);
for (auto iPoint = 0ul; iPoint < nPointDomain; iPoint++) {
++levelOffsets[levels[iPoint] + 1];
}

for (auto iLevel = 2ul; iLevel <= nLevels; ++iLevel) {
levelOffsets[iLevel] += levelOffsets[iLevel - 1];
}

Reorder(pointList, inversePointList, levelOffsets);

CalculateChain(levelOffsets, chainPtr, rowsPerBlock);
}
};
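To make the level assignment and offset computation in Partition() concrete, here is a self-contained sketch on a small assumed graph, with the CPoint adjacency replaced by a plain adjacency list (graph and values are illustrative only):

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  /*--- Assumed 5-point graph, already in its final ordering, with edges 0-1, 0-2, 1-3, 2-3, 3-4. ---*/
  const std::vector<std::vector<unsigned long>> adj = {{1, 2}, {0, 3}, {0, 3}, {1, 2, 4}, {3}};
  const unsigned long nPoint = adj.size();

  /*--- Each point's level is one more than the highest level among its lower-numbered neighbours. ---*/
  std::vector<unsigned long> levels(nPoint, 0);
  unsigned long nLevels = 0;
  for (unsigned long point = 0; point < nPoint; ++point) {
    for (auto neighbour : adj[point]) {
      if (neighbour < point) levels[point] = std::max(levels[point], levels[neighbour] + 1);
    }
    nLevels = std::max(nLevels, levels[point] + 1);
  }

  /*--- Points {0}, {1,2}, {3}, {4} land in levels 0..3; rows within one level have no mutual
        dependency, so the triangular sweeps can process them in parallel. ---*/
  assert((levels == std::vector<unsigned long>{0, 1, 1, 2, 3}));
  assert(nLevels == 4);

  /*--- Counting the rows per level and accumulating gives the level offsets {0, 1, 3, 4, 5}. ---*/
  std::vector<unsigned long> levelOffsets(nLevels + 1, 0);
  for (unsigned long point = 0; point < nPoint; ++point) ++levelOffsets[levels[point] + 1];
  for (unsigned long iLevel = 2; iLevel <= nLevels; ++iLevel) levelOffsets[iLevel] += levelOffsets[iLevel - 1];
  assert((levelOffsets == std::vector<unsigned long>{0, 1, 3, 4, 5}));
  return 0;
}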
8 changes: 8 additions & 0 deletions Common/include/linear_algebra/CPreconditioner.hpp
@@ -205,7 +205,15 @@ class CLU_SGSPreconditioner final : public CPreconditioner<ScalarType> {
* \param[out] v - CSysVector that is the result of the preconditioning.
*/
inline void operator()(const CSysVector<ScalarType>& u, CSysVector<ScalarType>& v) const override {
#ifdef HAVE_CUDA
if (config->GetCUDA()) {
sparse_matrix.GPUComputeLU_SGSPreconditioner(u, v, geometry, config);
} else {
sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);
}
#else
sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);
#endif
Comment on lines +211 to +216
Suggested change
} else {
sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);
}
#else
sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);
#endif
return;
}
#endif
sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);

}
};

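Reading the suggestion together with the surrounding lines, the preconditioner dispatch would end up roughly in this early-return form (a sketch of the suggested shape, not the committed code):

inline void operator()(const CSysVector<ScalarType>& u, CSysVector<ScalarType>& v) const override {
#ifdef HAVE_CUDA
  if (config->GetCUDA()) {
    sparse_matrix.GPUComputeLU_SGSPreconditioner(u, v, geometry, config);
    return;  /*--- Skip the CPU path once the GPU preconditioner has run. ---*/
  }
#endif
  sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);
}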
10 changes: 6 additions & 4 deletions Common/include/linear_algebra/CSysMatrix.hpp
@@ -148,8 +148,10 @@ class CSysMatrix {
ScalarType* d_matrix; /*!< \brief Device Pointer to store the matrix values on the GPU. */
const unsigned long* d_row_ptr; /*!< \brief Device Pointers to the first element in each row. */
const unsigned long* d_col_ind; /*!< \brief Device Column index for each of the elements in val(). */
bool useCuda; /*!< \brief Boolean that indicates whether user has enabled CUDA or not.
Mainly used to conditionally free GPU memory in the class destructor. */
const unsigned long* d_dia_ptr; /*!< \brief Device pointer to the diagonal element in each row. */
unsigned long* d_partition_offsets; /*!< \brief Device copy of the indices at which parallel partitions of the matrix graph begin. */
bool useCuda; /*!< \brief Boolean that indicates whether user has enabled CUDA or not.
Mainly used to conditionally free GPU memory in the class destructor. */

ScalarType* ILU_matrix; /*!< \brief Entries of the ILU sparse matrix. */
unsigned long nnz_ilu; /*!< \brief Number of possible nonzero entries in the matrix (ILU). */
@@ -904,8 +906,8 @@ class CSysMatrix {
* \param[in] vec - CSysVector to be multiplied by the preconditioner.
* \param[out] prod - Result of the product A*vec.
*/
void GPUComputeLU_SGSPreconditioner(ScalarType& vec, ScalarType& prod, CGeometry* geometry,
const CConfig* config) const;
void GPUComputeLU_SGSPreconditioner(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod,
CGeometry* geometry, const CConfig* config) const;

/*!
* \brief Build the Jacobi preconditioner.
66 changes: 61 additions & 5 deletions Common/include/linear_algebra/GPUComms.cuh
@@ -25,16 +25,72 @@
* License along with SU2. If not, see <http://www.gnu.org/licenses/>.
*/

#pragma once

#include<cuda_runtime.h>
#include<iostream>
#include "../option_structure.hpp"

/*!
* \struct matrixParameters
* \brief Structure containing information related to the Jacobian Matrix which is utilized by any launched Kernel.
*
 * This implementation alleviates the need to pass an excessive number of arguments
 * to a Kernel and, instead, packages them into a single structure. While this leads
 * to data duplication for a short period of time, this is a much cleaner and more reusable approach.
* \author A. Raj
*/
struct matrixParameters{
Upper case for classes, structs etc

Suggested change
struct matrixParameters{
struct MatrixParameters{


namespace KernelParameters{
public:
unsigned long totalRows; /*!< \brief Contains the total number of rows of the Jacobian Matrix. */
unsigned short blockRowSize; /*!< \brief Contains the row dimensions of the blocks of the Jacobian Matrix. */
unsigned short blockColSize; /*!< \brief Contains the column dimensions of the blocks of the Jacobian Matrix. */
unsigned int nChainStart; /*!< \brief Starting partition of the current chain. */
unsigned int nChainEnd; /*!< \brief Ending partition of the current chain. */
unsigned short blockSize; /*!< \brief Contains the total number of elements in each block of the Jacobian Matrix. */
unsigned short activeThreads; /*!< \brief Contains the number of active threads per iteration during MVP - depending on the
dimensions of the Jacobian Matrix. */
unsigned short rowsPerBlock; /*!< \brief Number of rows being processed by each thread block. This is equal to the number
of warps present in the block as each row gets assigned a warp. */

inline constexpr int round_up_division(const int multiple, int x) { return ((x + multiple - 1) / multiple); }
matrixParameters(unsigned long nPointDomain, unsigned long nEqn, unsigned long nVar, unsigned long nPartitions, unsigned short rowsPrBlck){
totalRows = nPointDomain;
blockRowSize = nEqn;
blockColSize = nVar;
nChainStart = 0;
nChainEnd = 0;
blockSize = nVar * nEqn;
activeThreads = nVar * (cudaKernelParameters::CUDA_WARP_SIZE/nVar);
rowsPerBlock = rowsPrBlck;
}

/*!
* \brief Returns the memory index in the shared memory array used by the Symmetric Iteration Kernels.
*/
__device__ __forceinline__ unsigned short shrdMemIndex(unsigned short localRow, unsigned short threadNo){
return (localRow * blockSize + threadNo);
}

/*!
* \brief Returns a boolean value to check whether the row is under the total number of rows and if the
* thread number is within a user-specified thread limit. This is to avoid illegal memory accesses.
*/
__device__ __forceinline__ bool validAccess(unsigned long row, unsigned short threadNo, unsigned short threadLimit){
return (row<totalRows && threadNo<threadLimit);
}

/*!
* \brief Returns a boolean value to check whether the row is part of the parallel partition being executed and if the
* thread number is within a user-specified thread limit. This is to avoid illegal memory accesses.
* \param[in] rowInPartition - Represents a boolean that indicates the presence/absence of the row in the partition.
*/
__device__ __forceinline__ bool validParallelAccess(bool rowInPartition, unsigned short threadNo, unsigned short threadLimit){
return (rowInPartition && threadNo<threadLimit);
}

};
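As a rough illustration of how this struct is meant to be consumed on the device (the kernel below, its one-warp-per-row indexing, and the warp-size constant are assumptions for illustration, not part of this file):

template <class ScalarType>
__global__ void ScaleDiagonalBlocks(ScalarType* d_matrix, const unsigned long* d_dia_ptr,
                                    ScalarType factor, matrixParameters params) {
  const unsigned short warp = 32;  /*--- assumed equal to cudaKernelParameters::CUDA_WARP_SIZE ---*/

  /*--- One warp per row: warps are counted across the whole grid. ---*/
  const unsigned long row = (blockIdx.x * blockDim.x + threadIdx.x) / warp;
  const unsigned short lane = threadIdx.x % warp;

  /*--- Guard against rows past the end of the matrix before touching memory. ---*/
  if (params.validAccess(row, lane, warp)) {
    /*--- Each lane strides over the entries of the row's diagonal block; consecutive
          lanes touch consecutive addresses, so the accesses coalesce. ---*/
    for (unsigned short iEntry = lane; iEntry < params.blockSize; iEntry += warp) {
      d_matrix[d_dia_ptr[row] * params.blockSize + iEntry] *= factor;
    }
  }
}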

static constexpr int MVP_BLOCK_SIZE = 1024;
static constexpr int MVP_WARP_SIZE = 32;
}
/*!
* \brief assert style function that reads return codes after intercepting CUDA API calls.
* It returns the result code and its location if the call is unsuccessful.