LU_SGS Preconditioner GPU Port #2539
base: develop
@@ -521,6 +521,7 @@ class CConfig {
  Kind_Gradient_Method_Recon,     /*!< \brief Numerical method for computation of spatial gradients used for upwind reconstruction. */
  Kind_Deform_Linear_Solver,      /*!< Numerical method to deform the grid */
  Kind_Deform_Linear_Solver_Prec, /*!< \brief Preconditioner of the linear solver. */
  Kind_Graph_Part_Algo,           /*!< \brief Algorithm for parallel partitioning of the matrix graph. */
  Kind_Linear_Solver,             /*!< \brief Numerical solver for the implicit scheme. */
  Kind_Linear_Solver_Prec,        /*!< \brief Preconditioner of the linear solver. */
  Kind_DiscAdj_Linear_Solver,     /*!< \brief Linear solver for the discrete adjoint system. */

@@ -634,6 +635,8 @@ class CConfig {
  unsigned long Linear_Solver_Restart_Frequency; /*!< \brief Restart frequency of the linear solver for the implicit formulation. */
  unsigned long Linear_Solver_Prec_Threads;      /*!< \brief Number of threads per rank for ILU and LU_SGS preconditioners. */
  unsigned short Linear_Solver_ILU_n;            /*!< \brief ILU fill-in level. */
  unsigned short Cuda_Block_Size;                /*!< \brief User-specified value for the X-axis dimension of the thread blocks
                                                             that are deployed by the CUDA kernels. */
  su2double SemiSpan;                  /*!< \brief Wing semi-span. */
  su2double Roe_Kappa;                 /*!< \brief Relaxation of the Roe scheme. */
  su2double Relaxation_Factor_Adjoint; /*!< \brief Relaxation coefficient for variable updates of adjoint solvers. */

@@ -4160,6 +4163,12 @@ class CConfig {
   */
  bool GetLeastSquaresRequired(void) const { return LeastSquaresRequired; }

  /*!
   * \brief Get the type of algorithm used for partitioning the matrix graph.
   * \return Algorithm that divides the matrix into partitions that are executed in parallel.
   */
  unsigned short GetKind_Graph_Part_Algo(void) const { return Kind_Graph_Part_Algo; }

  /*!
   * \brief Get the kind of solver for the implicit solver.
   * \return Numerical solver for implicit formulation (solving the linear system).

@@ -4221,6 +4230,18 @@ class CConfig {
   */
  su2double GetLinear_Solver_Smoother_Relaxation(void) const { return Linear_Solver_Smoother_Relaxation; }

  /*!
   * \brief Get the thread block dimension (X-axis) used by the CUDA kernels.
   * \return Thread block dimension (X-axis) used by the CUDA kernels.
   */
  unsigned short GetCuda_Block_Size(void) const { return Cuda_Block_Size; }

  /*!
   * \brief Get the number of matrix rows assigned per CUDA block.
   * \return The number of matrix rows assigned per CUDA block.
   */
  unsigned short GetRows_Per_Cuda_Block(void) const { return cudaKernelParameters::rounded_up_division(cudaKernelParameters::CUDA_WARP_SIZE, Cuda_Block_Size); }

Review comment on lines +4239 to +4244 (Member): Move somewhere in linear algebra since it's not really a config option.

  /*!
   * \brief Get the relaxation factor for solution updates of adjoint solvers.
   */
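To make the arithmetic behind GetRows_Per_Cuda_Block concrete, here is a minimal standalone sketch. It assumes that rounded_up_division performs the usual ceiling division (x + multiple - 1) / multiple, mirroring the round_up_division helper shown in the CUDA header further below, and that CUDA_WARP_SIZE is 32; the constant names are illustrative, not taken from the code.

```cpp
#include <cstdio>

// Assumed to mirror cudaKernelParameters::rounded_up_division (ceiling division).
constexpr unsigned short RoundedUpDivision(unsigned short multiple, unsigned short x) {
  return (x + multiple - 1) / multiple;
}

int main() {
  constexpr unsigned short kWarpSize = 32;     // assumed CUDA_WARP_SIZE
  constexpr unsigned short kBlockSize = 1024;  // example value of the CUDA block size option

  // One warp is assigned per matrix row, so rows per block = warps per block.
  constexpr unsigned short kRowsPerBlock = RoundedUpDivision(kWarpSize, kBlockSize);
  static_assert(kRowsPerBlock == 32, "1024 threads / 32 lanes per warp = 32 warps, i.e. 32 rows");

  std::printf("rows per CUDA block: %d\n", (int)kRowsPerBlock);
  return 0;
}
```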
@@ -260,6 +260,13 @@ class CGeometry {
  unsigned long* nPointCumulative{nullptr}; /*!< \brief Cumulative storage array containing the total number of points
                                                        on all prior ranks in the linear partitioning. */

  unsigned long nPartition; /*!< \brief Number of divisions of the matrix graph during execution of parallel
                                        partitioning algorithms. */
Review comment on lines +263 to +264 (Member): Let's call it levels or colors, partition is associated with MPI parallelization.
  unsigned long maxPartitionSize; /*!< \brief Size of the level with the maximum number of elements. */
  vector<unsigned long>
      partitionOffsets;           /*!< \brief Vector array containing the indices at which different parallel partitions begin. */
  vector<unsigned long> chainPtr; /*!< \brief Vector array containing the distribution of levels into chains. */
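To make the roles of these new members concrete, here is a hypothetical example with invented numbers (a 15-point matrix graph split into three levels); it only illustrates how the offsets and chain pointers relate to one another and is not data from the PR.

```cpp
#include <vector>

// Invented values for a 15-point matrix graph split into 3 levels.
std::vector<unsigned long> partitionOffsets{0, 4, 9, 15};  // level i spans rows [offsets[i], offsets[i+1])
std::vector<unsigned long> chainPtr{0, 2, 3};              // levels [0,2) form one chain, level [2,3) its own chain
unsigned long nPartition = 3;                              // number of levels ("colors", per the review comment)
unsigned long maxPartitionSize = 6;                        // widest level holds rows 9..14
```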

  /*--- Data structures for point-to-point MPI communications. ---*/

  int maxCountPerPoint{0}; /*!< \brief Maximum number of pieces of data sent per vertex in point-to-point comms. */

@@ -0,0 +1,182 @@
/*!
 * \file CGraphPartitioning.hpp
 * \brief Headers for the classes related to the algorithms that are used
 *        to divide the matrix acyclic graph into parallel partitions.
 * \author A. Raj
 * \version 8.2.0 "Harrier"
 *
 * SU2 Project Website: https://su2code.github.io
 *
 * The SU2 Project is maintained by the SU2 Foundation
 * (http://su2foundation.org)
 *
 * Copyright 2012-2025, SU2 Contributors (cf. AUTHORS.md)
 *
 * SU2 is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * SU2 is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
 */

#pragma once

#include "../geometry/CGeometry.hpp"

/*!
 * \class CGraphPartitioning
 * \brief Abstract base class for defining graph partitioning algorithms.
 * \author A. Raj
 *
 * In order to use certain parallel algorithms in the solution process -
 * whether with linear solvers or preconditioners - we require the matrix
 * to be partitioned into certain parallel divisions. These may be in the form
 * of levels, blocks, colors and so on. Since a number of different algorithms
 * can be used to split the graph, we've introduced a base class containing the
 * "Partition" member function from which child classes of the specific
 * algorithm can be derived. Currently, we only use direct declarations
 * of the derived classes in the code. However, this design was chosen as it
 * allows us to pass different child class algorithms to a single implementation
 * of the function that requires it - similar to the CMatrixVectorProduct class.
 */

template <class ScalarType>
class CGraphPartitioning {
 public:
  virtual ~CGraphPartitioning() = 0;
  virtual void Partition(vector<ScalarType>& pointList, vector<ScalarType>& partitionOffsets,
                         vector<ScalarType>& chainPtr, unsigned short chainLimit) = 0;
};
template <class ScalarType>
CGraphPartitioning<ScalarType>::~CGraphPartitioning() {}

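As a sketch of the intent described in the comment above (passing different partitioning algorithms through one interface, in the spirit of CMatrixVectorProduct), a caller could look roughly like this; SetupParallelPartitions and its arguments are hypothetical names, not code from this PR.

```cpp
#include <vector>

// Hypothetical caller that depends only on the abstract interface.
template <class IndexType>
void SetupParallelPartitions(CGraphPartitioning<IndexType>& partitioner, std::vector<IndexType>& pointList,
                             std::vector<IndexType>& levelOffsets, std::vector<IndexType>& chainPtr,
                             unsigned short rowsPerBlock) {
  // The concrete algorithm (level scheduling, coloring, ...) is chosen by whoever constructs the object.
  partitioner.Partition(pointList, levelOffsets, chainPtr, rowsPerBlock);
}

// Possible usage with the level-scheduling implementation defined below:
//   CLevelScheduling<unsigned long> scheduler(nPointDomain, nodes);
//   SetupParallelPartitions(scheduler, pointList, levelOffsets, chainPtr, rowsPerBlock);
```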
template <class ScalarType>
class CLevelScheduling final : public CGraphPartitioning<ScalarType> {
 private:
  ScalarType nPointDomain;

Review comment (Member): Scalar is usually a floating point but here it's an integer, rename to IndexType or something similar.

  CPoint* nodes;

 public:
  ScalarType nLevels;
  ScalarType maxLevelWidth;
  vector<ScalarType> levels;

  /*!
   * \brief Constructor of the class.
   * \param[in] nPointDomain_ref - Number of points associated with the problem.
   * \param[in] nodes_ref - Represents the relationships between the points.
   */
  inline CLevelScheduling<ScalarType>(ScalarType nPointDomain_ref, CPoint* nodes_ref)
      : nPointDomain(nPointDomain_ref), nodes(nodes_ref) {
    nLevels = 0ul;
    maxLevelWidth = 0ul;
  }

  CLevelScheduling() = delete;  // Removing default constructor

Review comment on lines +84 to +85 (Member): It's already deleted because you have defined a constructor.

  /*!
   * \brief Divides the levels into groups of chains depending on the preset GPU block and warp size.
   * \param[in] levelOffsets - Vector array containing the ordered list of starting rows of each level.
   * \param[in] chainPtr - Vector array containing the ordered list of starting levels of each chain.
   * \param[in] rowsPerBlock - Maximum number of rows that can be accommodated per CUDA block.
   */
  void CalculateChain(vector<ScalarType> levelOffsets, vector<ScalarType>& chainPtr, unsigned short rowsPerBlock) {
    ScalarType levelWidth = 0;

    /*--- This is not a magic number: we simply initialize the chain pointer
          array with its first element, which is always zero. ---*/
    chainPtr.push_back(0);

    for (ScalarType iLevel = 0ul; iLevel < nLevels; iLevel++) {
      levelWidth = levelOffsets[iLevel + 1] - levelOffsets[iLevel];
      maxLevelWidth = std::max(levelWidth, maxLevelWidth);

      if (levelWidth > rowsPerBlock) {
        if (chainPtr.back() != iLevel) {
          chainPtr.push_back(iLevel);
        }

        chainPtr.push_back(iLevel + 1);
      }
    }

    chainPtr.push_back(nLevels);
  }

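A short worked illustration of the chain construction may help; the level widths below are invented for the example.

```cpp
// Suppose rowsPerBlock = 4, nLevels = 4 and the level widths are {3, 2, 6, 3},
// i.e. levelOffsets = {0, 3, 5, 11, 14}. The loop then behaves as follows:
//
//   iLevel = 0, 1 : width <= rowsPerBlock, nothing is pushed
//   iLevel = 2    : width 6 > 4 -> push 2 (close the chain of narrow levels), then push 3 (isolate the wide level)
//   iLevel = 3    : width <= rowsPerBlock, nothing is pushed
//   final step    : push nLevels = 4
//
// Result: chainPtr = {0, 2, 3, 4}. Levels [0,2) run as one chain of narrow levels,
// the wide level [2,3) is executed on its own, and [3,4) forms the trailing chain.
```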
  /*!
   * \brief Reorders the points according to the levels.
   * \param[in] pointList - Ordered array that contains the list of all mesh points.
   * \param[in] inversePointList - Array utilized to access the index of each point in pointList.
   * \param[in] levelOffsets - Vector array containing the ordered list of starting rows of each level.
   */
  void Reorder(vector<ScalarType>& pointList, vector<ScalarType>& inversePointList, vector<ScalarType> levelOffsets) {

Review comment on lines +117 to +121 (Member): pointList is an output; in the function signature, inputs should come before outputs.
    for (auto localPoint = 0ul; localPoint < nPointDomain; ++localPoint) {
      const auto globalPoint = pointList[localPoint];
      inversePointList[levelOffsets[levels[localPoint]]++] = globalPoint;
    }

    pointList = std::move(inversePointList);
  }

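The loop above is effectively a stable counting sort by level, with levelOffsets (taken by value) used as running insertion cursors. A tiny invented trace:

```cpp
// Three points with levels = {1, 0, 1} and levelOffsets = {0, 1, 3}:
//
//   localPoint 0: level 1 -> inversePointList[1] = pointList[0], cursors become {0, 2, 3}
//   localPoint 1: level 0 -> inversePointList[0] = pointList[1], cursors become {1, 2, 3}
//   localPoint 2: level 1 -> inversePointList[2] = pointList[2], cursors become {1, 3, 3}
//
// pointList is then replaced by inversePointList, so the points end up grouped by
// level while keeping their relative order within each level.
```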
  /*!
   * \brief Partitions the graph into levels, reorders the points accordingly and groups the levels into chains.
   * \param[in] pointList - Ordered array that contains the list of all mesh points.
   * \param[in] levelOffsets - Vector array containing the ordered list of starting rows of each level.
   * \param[in] chainPtr - Vector array containing the ordered list of starting levels of each chain.
   * \param[in] rowsPerBlock - Maximum number of rows that can be accommodated per CUDA block.
   */

Review comment on lines +132 to +135 (Member): Inputs should be const&; if they can't be const, they are either in/out or out.

  void Partition(vector<ScalarType>& pointList, vector<ScalarType>& levelOffsets, vector<ScalarType>& chainPtr,
                 unsigned short rowsPerBlock) override {
    vector<ScalarType> inversePointList;
    inversePointList.resize(nPointDomain);
    levels.resize(nPointDomain);

    for (auto point = 0ul; point < nPointDomain; point++) {
      inversePointList[pointList[point]] = point;
      levels[point] = 0;
    }

    // Local Point  - Ordering of the points after the RCM ordering
    // Global Point - Original order of the points before the RCM ordering

    for (auto localPoint = 0ul; localPoint < nPointDomain; ++localPoint) {
      const auto globalPoint = pointList[localPoint];

      for (auto adjPoints = 0u; adjPoints < nodes->GetnPoint(globalPoint); adjPoints++) {
        const auto adjGlobalPoint = nodes->GetPoint(globalPoint, adjPoints);

        if (adjGlobalPoint < nPointDomain) {
          const auto adjLocalPoint = inversePointList[adjGlobalPoint];

          if (adjLocalPoint < localPoint) {
            levels[localPoint] = std::max(levels[localPoint], levels[adjLocalPoint] + 1);
          }
        }
      }

      nLevels = std::max(nLevels, levels[localPoint] + 1);
    }

    levelOffsets.resize(nLevels + 1);
    for (auto iPoint = 0ul; iPoint < nPointDomain; iPoint++) {
      ++levelOffsets[levels[iPoint] + 1];
    }

    for (auto iLevel = 2ul; iLevel <= nLevels; ++iLevel) {
      levelOffsets[iLevel] += levelOffsets[iLevel - 1];
    }

    Reorder(pointList, inversePointList, levelOffsets);

    CalculateChain(levelOffsets, chainPtr, rowsPerBlock);
  }
};
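For orientation, here is a usage sketch of the complete pass; the surrounding variable names are hypothetical, only CLevelScheduling and Partition come from this file.

```cpp
// Hypothetical driver for the level-scheduling pass.
CLevelScheduling<unsigned long> scheduler(nPointDomain, nodes);
scheduler.Partition(pointList, levelOffsets, chainPtr, rowsPerBlock);

// For a 1-D chain of points 0 - 1 - 2 - 3 with the natural ordering, every point
// depends on its predecessor, so the pass yields nLevels = 4 and
// levelOffsets = {0, 1, 2, 3, 4}: no two rows can be solved concurrently.
// On typical meshes the levels are much wider, and the rows inside one level are
// what the GPU preconditioner can process in parallel.
```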

@@ -205,7 +205,15 @@ class CLU_SGSPreconditioner final : public CPreconditioner<ScalarType> {
   * \param[out] v - CSysVector that is the result of the preconditioning.
   */
  inline void operator()(const CSysVector<ScalarType>& u, CSysVector<ScalarType>& v) const override {
#ifdef HAVE_CUDA
    if (config->GetCUDA()) {
      sparse_matrix.GPUComputeLU_SGSPreconditioner(u, v, geometry, config);
    } else {
      sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);
    }
#else
    sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);
#endif
  }
};

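For illustration only, here is one way the duplicated CPU call in the block above could be collapsed; this is an editorial sketch, not code from the PR.

```cpp
inline void operator()(const CSysVector<ScalarType>& u, CSysVector<ScalarType>& v) const override {
#ifdef HAVE_CUDA
  if (config->GetCUDA()) {
    sparse_matrix.GPUComputeLU_SGSPreconditioner(u, v, geometry, config);
    return;
  }
#endif
  sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);
}
```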
@@ -25,16 +25,72 @@
 * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
 */

#pragma once

#include <cuda_runtime.h>
#include <iostream>
#include "../option_structure.hpp"

/*!
 * \struct matrixParameters
 * \brief Structure containing information related to the Jacobian matrix which is utilized by any launched kernel.
 *
 * This implementation alleviates the need to pass an excessive number of arguments
 * to a kernel and, instead, packages them into a single structure. While this leads
 * to data duplication for a short period of time, it is a much cleaner and reusable approach.
 * \author A. Raj
 */
struct matrixParameters{

Review comment (Member): Upper case for classes, structs, etc.

namespace KernelParameters{
 public:
  unsigned long totalRows;      /*!< \brief Contains the total number of rows of the Jacobian matrix. */
  unsigned short blockRowSize;  /*!< \brief Contains the row dimensions of the blocks of the Jacobian matrix. */
  unsigned short blockColSize;  /*!< \brief Contains the column dimensions of the blocks of the Jacobian matrix. */
  unsigned int nChainStart;     /*!< \brief Starting partition of the current chain. */
  unsigned int nChainEnd;       /*!< \brief Ending partition of the current chain. */
  unsigned short blockSize;     /*!< \brief Contains the total number of elements in each block of the Jacobian matrix. */
  unsigned short activeThreads; /*!< \brief Contains the number of active threads per iteration during MVP - depending on the
                                            dimensions of the Jacobian matrix. */
  unsigned short rowsPerBlock;  /*!< \brief Number of rows being processed by each thread block. This is equal to the number
                                            of warps present in the block, as each row gets assigned a warp. */

  inline constexpr int round_up_division(const int multiple, int x) { return ((x + multiple - 1) / multiple); }
  matrixParameters(unsigned long nPointDomain, unsigned long nEqn, unsigned long nVar, unsigned long nPartitions, unsigned short rowsPrBlck){
    totalRows = nPointDomain;
    blockRowSize = nEqn;
    blockColSize = nVar;
    nChainStart = 0;
    nChainEnd = 0;
    blockSize = nVar * nEqn;
    activeThreads = nVar * (cudaKernelParameters::CUDA_WARP_SIZE/nVar);
    rowsPerBlock = rowsPrBlck;
  }

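As a worked illustration of the constructor arithmetic above, assuming CUDA_WARP_SIZE = 32 and a compressible flow problem with nVar = nEqn = 5 (the interpretation of the idle lanes is an assumption, not stated in the PR):

```cpp
constexpr unsigned short kWarpSize = 32;                             // assumed CUDA_WARP_SIZE
constexpr unsigned short kVar = 5, kEqn = 5;                         // example block dimensions
constexpr unsigned short kBlockSize = kVar * kEqn;                   // 25 entries per Jacobian block
constexpr unsigned short kActiveThreads = kVar * (kWarpSize / kVar); // 5 * 6 = 30 active lanes per warp
static_assert(kBlockSize == 25 && kActiveThreads == 30,
              "active lanes = largest multiple of nVar that fits in a warp; the remaining 2 lanes idle");
```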
  /*!
   * \brief Returns the memory index in the shared memory array used by the Symmetric Iteration kernels.
   */
  __device__ __forceinline__ unsigned short shrdMemIndex(unsigned short localRow, unsigned short threadNo){
    return (localRow * blockSize + threadNo);
  }

  /*!
   * \brief Returns a boolean value to check whether the row is under the total number of rows and whether the
   *        thread number is within a user-specified thread limit. This is to avoid illegal memory accesses.
   */
  __device__ __forceinline__ bool validAccess(unsigned long row, unsigned short threadNo, unsigned short threadLimit){
    return (row < totalRows && threadNo < threadLimit);
  }

  /*!
   * \brief Returns a boolean value to check whether the row is part of the parallel partition being executed and whether the
   *        thread number is within a user-specified thread limit. This is to avoid illegal memory accesses.
   * \param[in] rowInPartition - Represents a boolean that indicates the presence/absence of the row in the partition.
   */
  __device__ __forceinline__ bool validParallelAccess(bool rowInPartition, unsigned short threadNo, unsigned short threadLimit){
    return (rowInPartition && threadNo < threadLimit);
  }

};

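Finally, a hypothetical kernel fragment showing how the guard helpers are presumably meant to be used; ExampleKernel, vec and res are invented names, while matrixParameters, validAccess, activeThreads and rowsPerBlock come from the struct above.

```cpp
__global__ void ExampleKernel(matrixParameters param, const double* vec, double* res) {
  // One warp per matrix row: consecutive warps of the block map to consecutive rows.
  constexpr unsigned short kWarpSize = 32;  // assumed CUDA_WARP_SIZE
  const unsigned long row = blockIdx.x * param.rowsPerBlock + threadIdx.x / kWarpSize;
  const unsigned short threadNo = threadIdx.x % kWarpSize;

  if (param.validAccess(row, threadNo, param.activeThreads)) {
    // ... per-row work on vec/res, guarded against rows beyond totalRows and against idle lanes ...
  }
}
```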
static constexpr int MVP_BLOCK_SIZE = 1024;
static constexpr int MVP_WARP_SIZE = 32;
}
/*!
 * \brief assert style function that reads return codes after intercepting CUDA API calls.
 *        It returns the result code and its location if the call is unsuccessful.

Review comment: Align with other comments.