Changes from all commits
22 commits
79c9e52
updating branch
areenraj Jun 10, 2025
6686171
updating branch
areenraj Jun 10, 2025
b7591f4
Major commit: graph partitioning algorithms, level scheduling method,…
areenraj Jun 25, 2025
6f51f45
Major commit: graph partitioning algorithms, level scheduling method,…
areenraj Jun 25, 2025
cdf0225
resolving conflicts
areenraj Jun 25, 2025
9dd3028
resolving conflicts
areenraj Jun 25, 2025
2495edd
resolving some more conflicts
areenraj Jun 25, 2025
f68358e
cleaning up
areenraj Jun 25, 2025
bf561b6
apologies for repeated commits, just cleaning
areenraj Jun 25, 2025
9121e25
coalesced memory access for MVP, shared memory addition and lamda fun…
areenraj Jun 29, 2025
991e29f
bug fixes
areenraj Jul 1, 2025
9bfeff9
Merge remote-tracking branch 'upstream/master'
areenraj Jul 3, 2025
0372099
Working GPU LU_SGS Preconditioner Port
areenraj Jul 15, 2025
52b90b6
Fixed the issue with the visibility of the rowsPerBlock variable. Als…
areenraj Jul 17, 2025
d367627
Working LU_SGS Preconditioner with graph partitioned algorithms, upda…
areenraj Jul 17, 2025
661c9b8
LU_SGS Preconditioner Port
areenraj Jul 17, 2025
b5cf7dd
Merge branch 'master' of https://github.com/areenraj/SU2_GSoC_GPU
areenraj Jul 17, 2025
c4dbe5c
Fixing warnings
areenraj Jul 17, 2025
b3d2fbf
Merge branch 'develop' of https://github.com/su2code/SU2
areenraj Jul 17, 2025
1be1e2f
Syncing repo to develop
areenraj Jul 17, 2025
7472bd1
updating submodule versions
areenraj Jul 17, 2025
352e148
Fixing some more warnings
areenraj Jul 17, 2025
21 changes: 21 additions & 0 deletions Common/include/CConfig.hpp
@@ -521,6 +521,7 @@ class CConfig {
Kind_Gradient_Method_Recon, /*!< \brief Numerical method for computation of spatial gradients used for upwind reconstruction. */
Kind_Deform_Linear_Solver, /*!< Numerical method to deform the grid */
Kind_Deform_Linear_Solver_Prec, /*!< \brief Preconditioner of the linear solver. */
Kind_Graph_Part_Algo, /*!< \brief Algorithm for parallel partitioning of the matrix graph. */
Align with other comments

Suggested change
Kind_Graph_Part_Algo, /*!< \brief Algorithm for parallel partitioning of the matrix graph. */
Kind_Graph_Part_Algo, /*!< \brief Algorithm for parallel partitioning of the matrix graph. */

Kind_Linear_Solver, /*!< \brief Numerical solver for the implicit scheme. */
Kind_Linear_Solver_Prec, /*!< \brief Preconditioner of the linear solver. */
Kind_DiscAdj_Linear_Solver, /*!< \brief Linear solver for the discrete adjoint system. */
@@ -634,6 +635,8 @@ class CConfig {
unsigned long Linear_Solver_Restart_Frequency; /*!< \brief Restart frequency of the linear solver for the implicit formulation. */
unsigned long Linear_Solver_Prec_Threads; /*!< \brief Number of threads per rank for ILU and LU_SGS preconditioners. */
unsigned short Linear_Solver_ILU_n; /*!< \brief ILU fill-in level. */
unsigned short Cuda_Block_Size; /*!< \brief User-specified value for the X-Axis dimension of thread blocks
that are deployed by the CUDA Kernels. */
su2double SemiSpan; /*!< \brief Wing Semi span. */
su2double Roe_Kappa; /*!< \brief Relaxation of the Roe scheme. */
su2double Relaxation_Factor_Adjoint; /*!< \brief Relaxation coefficient for variable updates of adjoint solvers. */
@@ -4160,6 +4163,12 @@ class CConfig {
*/
bool GetLeastSquaresRequired(void) const { return LeastSquaresRequired; }

/*!
* \brief Get the type of algorithm used for partitioning the matrix graph.
* \return Algorithm that divides the matrix into partitions that are executed in parallel.
*/
unsigned short GetKind_Graph_Part_Algo(void) const { return Kind_Graph_Part_Algo; }

/*!
* \brief Get the kind of solver for the implicit solver.
* \return Numerical solver for implicit formulation (solving the linear system).
@@ -4221,6 +4230,18 @@
*/
su2double GetLinear_Solver_Smoother_Relaxation(void) const { return Linear_Solver_Smoother_Relaxation; }

/*!
* \brief Get thread block dimensions (X-axis) being used by the CUDA Kernels.
* \return Thread block dimensions (X-axis) being used by the CUDA Kernels.
*/
unsigned short GetCuda_Block_Size(void) const { return Cuda_Block_Size; }

/*!
* \brief Get the number of matrix rows assigned per CUDA Block.
* \return The number of matrix rows assigned per CUDA Block.
*/
unsigned short GetRows_Per_Cuda_Block(void) const { return cudaKernelParameters::rounded_up_division(cudaKernelParameters::CUDA_WARP_SIZE, Cuda_Block_Size); }

Comment on lines +4239 to +4244
Move somewhere in linear algebra since it's not really a config option
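For reference, GetRows_Per_Cuda_Block() pairs one warp with one matrix row, so the value is the thread-block size divided by the warp size, rounded up. A minimal standalone sketch of that arithmetic, assuming the helper behaves like the ceiling division shown later in GPUComms.cuh:

#include <cassert>

/*--- Assumed behaviour of cudaKernelParameters::rounded_up_division(multiple, x): ceil(x / multiple). ---*/
constexpr int rounded_up_division(int multiple, int x) { return (x + multiple - 1) / multiple; }
constexpr int CUDA_WARP_SIZE = 32;

int main() {
  /*--- A block of 1024 threads holds 32 warps, so 32 matrix rows are handled per block. ---*/
  assert(rounded_up_division(CUDA_WARP_SIZE, 1024) == 32);
  /*--- A block size that is not a multiple of the warp size still rounds up: 96 threads -> 3 rows. ---*/
  assert(rounded_up_division(CUDA_WARP_SIZE, 96) == 3);
  return 0;
}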

/*!
* \brief Get the relaxation factor for solution updates of adjoint solvers.
*/
7 changes: 7 additions & 0 deletions Common/include/geometry/CGeometry.hpp
@@ -260,6 +260,13 @@ class CGeometry {
unsigned long* nPointCumulative{nullptr}; /*!< \brief Cumulative storage array containing the total number of points
on all prior ranks in the linear partitioning. */

unsigned long nPartition; /*!< \brief Number of divisions of the matrix graph during execution of parallel
partitioning algorithms. */
Comment on lines +263 to +264
Let's call it levels or colors, partition is associated with MPI parallelization

unsigned long maxPartitionSize; /*!< \brief Size of the level with the maximum number of elements. */
vector<unsigned long>
partitionOffsets; /*!< \brief Vector array containing the indices at which different parallel partitions begin. */
vector<unsigned long> chainPtr; /*!< \brief Vector array containing distribution of levels into chains. */

/*--- Data structures for point-to-point MPI communications. ---*/

int maxCountPerPoint{0}; /*!< \brief Maximum number of pieces of data sent per vertex in point-to-point comms. */
8 changes: 8 additions & 0 deletions Common/include/geometry/CPhysicalGeometry.hpp
@@ -148,6 +148,14 @@ class CPhysicalGeometry final : public CGeometry {
*/
void DistributeColoring(const CConfig* config, CGeometry* geometry);

/*!
* \brief Divide the graph produced by the matrix into parallel partitions.
* \param[in] config - Definition of the particular problem.
* \param[in] pointList - Ordered list of points in the mesh.
*/
template <class ScalarType>
void PartitionGraph(const CConfig* config, vector<ScalarType>& pointList);

/*!
* \brief Distribute the grid points, including ghost points, across all ranks based on a ParMETIS coloring.
* \param[in] config - Definition of the particular problem.
182 changes: 182 additions & 0 deletions Common/include/linear_algebra/CGraphPartitioning.hpp
@@ -0,0 +1,182 @@
/*!
* \file CGraphPartitioning.hpp
* \brief Headers for the classes realted to the algorithms that are used
to divide the matrix acyclic graph into parallel partitions.
* \author A. Raj
* \version 8.2.0 "Harrier"
*
* SU2 Project Website: https://su2code.github.io
*
* The SU2 Project is maintained by the SU2 Foundation
* (http://su2foundation.org)
*
* Copyright 2012-2025, SU2 Contributors (cf. AUTHORS.md)
*
* SU2 is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* SU2 is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with SU2. If not, see <http://www.gnu.org/licenses/>.
*/

#pragma once

#include "../geometry/CGeometry.hpp"

/*!
* \class CGraphPartitioning
* \brief Abstract base class for defining graph partitioning algorithms.
* \author A. Raj
*
* In order to use certain parallel algorithms in the solution process -
* whether with linear solvers or preconditioners - we require the matrix
 * to be partitioned into certain parallel divisions. These may be in the form
* of levels, blocks, colors and so on. Since a number of different algorithms
* can be used to split the graph, we've introduced a base class containing the
* "Partition" member function from which child classes of the specific
* algorithm can be derived. Currently, we are only using direct declarations
* of the derived classes in the code. However, this method was chosen as it
* allows us to pass different child class algorithms to a single implementation
* of the function that requires it - similar to the CMatrixVectorProduct class.
*/

template <class ScalarType>

class CGraphPartitioning {
public:
virtual ~CGraphPartitioning() = 0;
virtual void Partition(vector<ScalarType>& pointList, vector<ScalarType>& partitionOffsets,
vector<ScalarType>& chainPtr, unsigned short chainLimit) = 0;
};
template <class ScalarType>
CGraphPartitioning<ScalarType>::~CGraphPartitioning() {}
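A rough usage sketch of this interface (the wrapper function below is hypothetical and only illustrates the intent described in the class documentation):

template <class IndexType>
void ApplyPartitioning(CGraphPartitioning<IndexType>& algorithm, vector<IndexType>& pointList,
                       vector<IndexType>& partitionOffsets, vector<IndexType>& chainPtr,
                       unsigned short chainLimit) {
  /*--- The caller does not need to know which concrete algorithm (level scheduling, coloring, ...) it was given. ---*/
  algorithm.Partition(pointList, partitionOffsets, chainPtr, chainLimit);
}
/*--- e.g. CLevelScheduling<unsigned long> scheduler(nPointDomain, nodes);
          ApplyPartitioning(scheduler, pointList, levelOffsets, chainPtr, rowsPerBlock); ---*/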

template <class ScalarType>

Suggested change

class CLevelScheduling final : public CGraphPartitioning<ScalarType> {
private:
ScalarType nPointDomain;
Scalar is usually a floating point but here it's an integer, rename to IndexType or something similar.
See what I have in graph_toolbox.h

CPoint* nodes;

public:
ScalarType nLevels;
ScalarType maxLevelWidth;
vector<ScalarType> levels;

/*!
 * \brief Constructor of the class.
* \param[in] nPointDomain_ref - number of points associated with the problem
* \param[in] nodes_ref - represents the relationships between the points
*/
inline CLevelScheduling<ScalarType>(ScalarType nPointDomain_ref, CPoint* nodes_ref)
Suggested change
inline CLevelScheduling<ScalarType>(ScalarType nPointDomain_ref, CPoint* nodes_ref)
CLevelScheduling(ScalarType nPointDomain_ref, CPoint* nodes_ref)

: nPointDomain(nPointDomain_ref), nodes(nodes_ref) {
nLevels = 0ul;
maxLevelWidth = 0ul;
}

CLevelScheduling() = delete; // Removing default constructor

Comment on lines +84 to +85
It's already deleted because you have defined a constructor

Suggested change
CLevelScheduling() = delete; // Removing default constructor

/*!
* \brief Divides the levels into groups of chains depending on the preset GPU block and warp size.
* \param[in] levelOffsets - Represents the vector array containing the ordered list of starting rows of each level.
* \param[in] chainPtr - Represents the vector array containing the ordered list of starting levels of each chain.
 * \param[in] rowsPerBlock - Represents the maximum number of rows that can be accommodated per CUDA block.
*/
void CalculateChain(vector<ScalarType> levelOffsets, vector<ScalarType>& chainPtr, unsigned short rowsPerBlock) {
ScalarType levelWidth = 0;

/*This is not a magic number. We are simply initializing
the point array with its first element that is always zero.*/
chainPtr.push_back(0);

for (ScalarType iLevel = 0ul; iLevel < nLevels; iLevel++) {
levelWidth = levelOffsets[iLevel + 1] - levelOffsets[iLevel];
maxLevelWidth = std::max(levelWidth, maxLevelWidth);

if (levelWidth > rowsPerBlock) {
if (chainPtr.back() != iLevel) {
chainPtr.push_back(iLevel);
}

chainPtr.push_back(iLevel + 1);
}
}

chainPtr.push_back(nLevels);
}
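As a worked example of the grouping above, with assumed values: four levels of widths 1, 2, 4 and 1 give levelOffsets = {0, 1, 3, 7, 8}; with rowsPerBlock = 2 only the third level is too wide, so the loop produces chainPtr = {0, 2, 3, 4}. A standalone sketch that reproduces this:

#include <cassert>
#include <vector>

int main() {
  const std::vector<unsigned long> levelOffsets = {0, 1, 3, 7, 8};  /*--- four levels of widths 1, 2, 4, 1 ---*/
  const unsigned short rowsPerBlock = 2;
  const unsigned long nLevels = levelOffsets.size() - 1;

  std::vector<unsigned long> chainPtr = {0};  /*--- the first chain always starts at level 0 ---*/
  for (unsigned long iLevel = 0; iLevel < nLevels; ++iLevel) {
    const auto levelWidth = levelOffsets[iLevel + 1] - levelOffsets[iLevel];
    if (levelWidth > rowsPerBlock) {
      if (chainPtr.back() != iLevel) chainPtr.push_back(iLevel);
      chainPtr.push_back(iLevel + 1);
    }
  }
  chainPtr.push_back(nLevels);

  /*--- Levels 0-1 form one chain, the wide level 2 its own chain, and level 3 the last chain. ---*/
  assert((chainPtr == std::vector<unsigned long>{0, 2, 3, 4}));
  return 0;
}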

/*!
* \brief Reorders the points according to the levels
* \param[in] pointList - Ordered array that contains the list of all mesh points.
* \param[in] inversePointList - Array utilized to access the index of each point in pointList.
* \param[in] levelOffsets - Vector array containing the ordered list of starting rows of each level.
*/
void Reorder(vector<ScalarType>& pointList, vector<ScalarType>& inversePointList, vector<ScalarType> levelOffsets) {
Comment on lines +117 to +121
point list is an output, put the function signature, inputs should come before outputs

for (auto localPoint = 0ul; localPoint < nPointDomain; ++localPoint) {
const auto globalPoint = pointList[localPoint];
inversePointList[levelOffsets[levels[localPoint]]++] = globalPoint;
}

pointList = std::move(inversePointList);
}

/*!
 * \brief Partitions the graph into levels, reorders the points accordingly, and groups the levels into chains.
* \param[in] pointList - Ordered array that contains the list of all mesh points.
* \param[in] levelOffsets - Vector array containing the ordered list of starting rows of each level.
* \param[in] chainPtr - Represents the vector array containing the ordered list of starting levels of each chain.
 * \param[in] rowsPerBlock - Represents the maximum number of rows that can be accommodated per CUDA block.
Comment on lines +132 to +135
Inputs should be const&, if they can't be const, they are either in,out or out

*/
void Partition(vector<ScalarType>& pointList, vector<ScalarType>& levelOffsets, vector<ScalarType>& chainPtr,
unsigned short rowsPerBlock) override {
vector<ScalarType> inversePointList;
inversePointList.reserve(nPointDomain);
levels.reserve(nPointDomain);

for (auto point = 0ul; point < nPointDomain; point++) {
inversePointList[pointList[point]] = point;
levels[point] = 0;
}

// Local Point - Index of a point after the RCM reordering
// Global Point - Original index of a point before the RCM reordering

for (auto localPoint = 0ul; localPoint < nPointDomain; ++localPoint) {
const auto globalPoint = pointList[localPoint];

for (auto adjPoints = 0u; adjPoints < nodes->GetnPoint(globalPoint); adjPoints++) {
const auto adjGlobalPoint = nodes->GetPoint(globalPoint, adjPoints);

if (adjGlobalPoint < nPointDomain) {
const auto adjLocalPoint = inversePointList[adjGlobalPoint];

if (adjLocalPoint < localPoint) {
levels[localPoint] = std::max(levels[localPoint], levels[adjLocalPoint] + 1);
}
}
}

nLevels = std::max(nLevels, levels[localPoint] + 1);
}

levelOffsets.resize(nLevels + 1);
for (auto iPoint = 0ul; iPoint < nPointDomain; iPoint++) {
++levelOffsets[levels[iPoint] + 1];
}

for (auto iLevel = 2ul; iLevel <= nLevels; ++iLevel) {
levelOffsets[iLevel] += levelOffsets[iLevel - 1];
}

Reorder(pointList, inversePointList, levelOffsets);

CalculateChain(levelOffsets, chainPtr, rowsPerBlock);
}
};
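To make the level assignment and offset computation in Partition() concrete, here is a self-contained sketch on a small assumed graph, with the CPoint adjacency replaced by a plain adjacency list (graph and values are illustrative only):

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  /*--- Assumed 5-point graph, already in its final ordering, with edges 0-1, 0-2, 1-3, 2-3, 3-4. ---*/
  const std::vector<std::vector<unsigned long>> adj = {{1, 2}, {0, 3}, {0, 3}, {1, 2, 4}, {3}};
  const unsigned long nPoint = adj.size();

  /*--- Each point's level is one more than the highest level among its lower-numbered neighbours. ---*/
  std::vector<unsigned long> levels(nPoint, 0);
  unsigned long nLevels = 0;
  for (unsigned long point = 0; point < nPoint; ++point) {
    for (auto neighbour : adj[point]) {
      if (neighbour < point) levels[point] = std::max(levels[point], levels[neighbour] + 1);
    }
    nLevels = std::max(nLevels, levels[point] + 1);
  }

  /*--- Points {0}, {1,2}, {3}, {4} land in levels 0..3; rows within one level have no mutual
        dependency, so the triangular sweeps can process them in parallel. ---*/
  assert((levels == std::vector<unsigned long>{0, 1, 1, 2, 3}));
  assert(nLevels == 4);

  /*--- Counting the rows per level and accumulating gives the level offsets {0, 1, 3, 4, 5}. ---*/
  std::vector<unsigned long> levelOffsets(nLevels + 1, 0);
  for (unsigned long point = 0; point < nPoint; ++point) ++levelOffsets[levels[point] + 1];
  for (unsigned long iLevel = 2; iLevel <= nLevels; ++iLevel) levelOffsets[iLevel] += levelOffsets[iLevel - 1];
  assert((levelOffsets == std::vector<unsigned long>{0, 1, 3, 4, 5}));
  return 0;
}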
8 changes: 8 additions & 0 deletions Common/include/linear_algebra/CPreconditioner.hpp
@@ -205,7 +205,15 @@ class CLU_SGSPreconditioner final : public CPreconditioner<ScalarType> {
* \param[out] v - CSysVector that is the result of the preconditioning.
*/
inline void operator()(const CSysVector<ScalarType>& u, CSysVector<ScalarType>& v) const override {
#ifdef HAVE_CUDA
if (config->GetCUDA()) {
sparse_matrix.GPUComputeLU_SGSPreconditioner(u, v, geometry, config);
} else {
sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);
}
#else
sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);
#endif
Comment on lines +211 to +216
Suggested change
} else {
sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);
}
#else
sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);
#endif
return;
}
#endif
sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);

}
};

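Reading the suggestion together with the surrounding lines, the preconditioner dispatch would end up roughly in this early-return form (a sketch of the suggested shape, not the committed code):

inline void operator()(const CSysVector<ScalarType>& u, CSysVector<ScalarType>& v) const override {
#ifdef HAVE_CUDA
  if (config->GetCUDA()) {
    sparse_matrix.GPUComputeLU_SGSPreconditioner(u, v, geometry, config);
    return;  /*--- Skip the CPU path once the GPU preconditioner has run. ---*/
  }
#endif
  sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);
}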
10 changes: 6 additions & 4 deletions Common/include/linear_algebra/CSysMatrix.hpp
@@ -148,8 +148,10 @@ class CSysMatrix {
ScalarType* d_matrix; /*!< \brief Device Pointer to store the matrix values on the GPU. */
const unsigned long* d_row_ptr; /*!< \brief Device Pointers to the first element in each row. */
const unsigned long* d_col_ind; /*!< \brief Device Column index for each of the elements in val(). */
bool useCuda; /*!< \brief Boolean that indicates whether user has enabled CUDA or not.
Mainly used to conditionally free GPU memory in the class destructor. */
const unsigned long* d_dia_ptr; /*!< \brief Device pointer to the diagonal element in each row. */
unsigned long* d_partition_offsets; /*!< \brief Device copy of the indices at which parallel partitions of the matrix graph begin. */
bool useCuda; /*!< \brief Boolean that indicates whether user has enabled CUDA or not.
Mainly used to conditionally free GPU memory in the class destructor. */

ScalarType* ILU_matrix; /*!< \brief Entries of the ILU sparse matrix. */
unsigned long nnz_ilu; /*!< \brief Number of possible nonzero entries in the matrix (ILU). */
@@ -904,8 +906,8 @@ class CSysMatrix {
* \param[in] vec - CSysVector to be multiplied by the preconditioner.
* \param[out] prod - Result of the product A*vec.
*/
void GPUComputeLU_SGSPreconditioner(ScalarType& vec, ScalarType& prod, CGeometry* geometry,
const CConfig* config) const;
void GPUComputeLU_SGSPreconditioner(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod,
CGeometry* geometry, const CConfig* config) const;

/*!
* \brief Build the Jacobi preconditioner.
66 changes: 61 additions & 5 deletions Common/include/linear_algebra/GPUComms.cuh
@@ -25,16 +25,72 @@
* License along with SU2. If not, see <http://www.gnu.org/licenses/>.
*/

#pragma once

#include<cuda_runtime.h>
#include<iostream>
#include "../option_structure.hpp"

/*!
* \struct matrixParameters
* \brief Structure containing information related to the Jacobian Matrix which is utilized by any launched Kernel.
*
 * This implementation alleviates the need to pass an excessive number of arguments
 * to a Kernel and, instead, packages them into a single structure. While this leads
 * to data duplication for a short period of time, this is a much cleaner and more reusable approach.
* \author A. Raj
*/
struct matrixParameters{
Upper case for classes, structs etc

Suggested change
struct matrixParameters{
struct MatrixParameters{


namespace KernelParameters{
public:
unsigned long totalRows; /*!< \brief Contains the total number of rows of the Jacobian Matrix. */
unsigned short blockRowSize; /*!< \brief Contains the row dimensions of the blocks of the Jacobian Matrix. */
unsigned short blockColSize; /*!< \brief Contains the column dimensions of the blocks of the Jacobian Matrix. */
unsigned int nChainStart; /*!< \brief Starting partition of the current chain. */
unsigned int nChainEnd; /*!< \brief Ending partition of the current chain. */
unsigned short blockSize; /*!< \brief Contains the total number of elements in each block of the Jacobian Matrix. */
unsigned short activeThreads; /*!< \brief Contains the number of active threads per iteration during MVP - depending on the
dimensions of the Jacobian Matrix. */
unsigned short rowsPerBlock; /*!< \brief Number of rows being processed by each thread block. This is equal to the number
of warps present in the block as each row gets assigned a warp. */

inline constexpr int round_up_division(const int multiple, int x) { return ((x + multiple - 1) / multiple); }
matrixParameters(unsigned long nPointDomain, unsigned long nEqn, unsigned long nVar, unsigned long nPartitions, unsigned short rowsPrBlck){
totalRows = nPointDomain;
blockRowSize = nEqn;
blockColSize = nVar;
nChainStart = 0;
nChainEnd = 0;
blockSize = nVar * nEqn;
activeThreads = nVar * (cudaKernelParameters::CUDA_WARP_SIZE/nVar);
rowsPerBlock = rowsPrBlck;
}

/*!
* \brief Returns the memory index in the shared memory array used by the Symmetric Iteration Kernels.
*/
__device__ __forceinline__ unsigned short shrdMemIndex(unsigned short localRow, unsigned short threadNo){
return (localRow * blockSize + threadNo);
}

/*!
* \brief Returns a boolean value to check whether the row is under the total number of rows and if the
* thread number is within a user-specified thread limit. This is to avoid illegal memory accesses.
*/
__device__ __forceinline__ bool validAccess(unsigned long row, unsigned short threadNo, unsigned short threadLimit){
return (row<totalRows && threadNo<threadLimit);
}

/*!
* \brief Returns a boolean value to check whether the row is part of the parallel partition being executed and if the
* thread number is within a user-specified thread limit. This is to avoid illegal memory accesses.
* \param[in] rowInPartition - Represents a boolean that indicates the presence/absence of the row in the partition.
*/
__device__ __forceinline__ bool validParallelAccess(bool rowInPartition, unsigned short threadNo, unsigned short threadLimit){
return (rowInPartition && threadNo<threadLimit);
}

};
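As a rough illustration of how this struct is meant to be consumed on the device (the kernel below, its one-warp-per-row indexing, and the warp-size constant are assumptions for illustration, not part of this file):

template <class ScalarType>
__global__ void ScaleDiagonalBlocks(ScalarType* d_matrix, const unsigned long* d_dia_ptr,
                                    ScalarType factor, matrixParameters params) {
  const unsigned short warp = 32;  /*--- assumed equal to cudaKernelParameters::CUDA_WARP_SIZE ---*/

  /*--- One warp per row: warps are counted across the whole grid. ---*/
  const unsigned long row = (blockIdx.x * blockDim.x + threadIdx.x) / warp;
  const unsigned short lane = threadIdx.x % warp;

  /*--- Guard against rows past the end of the matrix before touching memory. ---*/
  if (params.validAccess(row, lane, warp)) {
    /*--- Each lane strides over the entries of the row's diagonal block; consecutive
          lanes touch consecutive addresses, so the accesses coalesce. ---*/
    for (unsigned short iEntry = lane; iEntry < params.blockSize; iEntry += warp) {
      d_matrix[d_dia_ptr[row] * params.blockSize + iEntry] *= factor;
    }
  }
}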

static constexpr int MVP_BLOCK_SIZE = 1024;
static constexpr int MVP_WARP_SIZE = 32;
}
/*!
* \brief assert style function that reads return codes after intercepting CUDA API calls.
* It returns the result code and its location if the call is unsuccessful.