diff --git a/include/DirectSolver/DirectSolverGive/buildSolverMatrix.inl b/include/DirectSolver/DirectSolverGive/buildSolverMatrix.inl index 59637462..48b2877a 100644 --- a/include/DirectSolver/DirectSolverGive/buildSolverMatrix.inl +++ b/include/DirectSolver/DirectSolverGive/buildSolverMatrix.inl @@ -5,8 +5,8 @@ namespace direct_solver_coo_mumps_give #ifdef GMGPOLAR_USE_MUMPS // When using the MUMPS solver, the matrix is assembled in COO format. -static inline void updateMatrixElement(SparseMatrixCOO& matrix, int ptr, int offset, int row, int column, - double value) +static inline void updateMatrixElement(SparseMatrixCOO& matrix, int ptr, int offset, int row, + int column, double value) { matrix.set_row_index(ptr + offset, row); matrix.set_col_index(ptr + offset, column); diff --git a/include/DirectSolver/DirectSolverTake/buildSolverMatrix.inl b/include/DirectSolver/DirectSolverTake/buildSolverMatrix.inl index 4cdb4487..a144a9c6 100644 --- a/include/DirectSolver/DirectSolverTake/buildSolverMatrix.inl +++ b/include/DirectSolver/DirectSolverTake/buildSolverMatrix.inl @@ -5,8 +5,8 @@ namespace direct_solver_coo_mumps_take #ifdef GMGPOLAR_USE_MUMPS // When using the MUMPS solver, the matrix is assembled in COO format. -static inline void updateMatrixElement(SparseMatrixCOO& matrix, int ptr, int offset, int row, int column, - double value) +static inline void updateMatrixElement(SparseMatrixCOO& matrix, int ptr, int offset, int row, + int column, double value) { matrix.set_row_index(ptr + offset, row); matrix.set_col_index(ptr + offset, column); diff --git a/include/ExtrapolatedSmoother/ExtrapolatedSmootherGive/buildInnerBoundaryAsc.inl b/include/ExtrapolatedSmoother/ExtrapolatedSmootherGive/buildInnerBoundaryAsc.inl index 0ff8b1b2..ad6b3362 100644 --- a/include/ExtrapolatedSmoother/ExtrapolatedSmootherGive/buildInnerBoundaryAsc.inl +++ b/include/ExtrapolatedSmoother/ExtrapolatedSmootherGive/buildInnerBoundaryAsc.inl @@ -7,8 +7,8 @@ namespace extrapolated_smoother_give #ifdef GMGPOLAR_USE_MUMPS // When using the MUMPS solver, the matrix is assembled in COO format. -static inline void update_CSR_COO_MatrixElement(SparseMatrixCOO& matrix, int ptr, int offset, int row, - int column, double value) +static inline void update_CSR_COO_MatrixElement(SparseMatrixCOO& matrix, int ptr, int offset, + int row, int column, double value) { matrix.set_row_index(ptr + offset, row); matrix.set_col_index(ptr + offset, column); diff --git a/include/ExtrapolatedSmoother/ExtrapolatedSmootherGive/extrapolatedSmootherGive.h b/include/ExtrapolatedSmoother/ExtrapolatedSmootherGive/extrapolatedSmootherGive.h index 281a0242..32f448d0 100644 --- a/include/ExtrapolatedSmoother/ExtrapolatedSmootherGive/extrapolatedSmootherGive.h +++ b/include/ExtrapolatedSmoother/ExtrapolatedSmootherGive/extrapolatedSmootherGive.h @@ -187,6 +187,8 @@ class ExtrapolatedSmootherGive : public ExtrapolatedSmoother /* Line-wise solvers */ /* ----------------- */ + // Functions must be public due to cuda restriction +public: // Solve the linear system: // A_sc * u_sc = f_sc − A_sc^ortho * u_sc^ortho // Parameter mapping: diff --git a/include/ExtrapolatedSmoother/ExtrapolatedSmootherGive/solveAscSystem.inl b/include/ExtrapolatedSmoother/ExtrapolatedSmootherGive/solveAscSystem.inl index 11cb6d1a..87ebfcb2 100644 --- a/include/ExtrapolatedSmoother/ExtrapolatedSmootherGive/solveAscSystem.inl +++ b/include/ExtrapolatedSmoother/ExtrapolatedSmootherGive/solveAscSystem.inl @@ -27,13 +27,18 @@ void ExtrapolatedSmootherGive::solveBlackCircleSection(Vector>({0, 0}, + {num_black_circles, grid.ntheta()}), + KOKKOS_LAMBDA(const int circle_task, const int i_theta) { + const int i_r = start_black_circles + circle_task * 2; + const int index = grid.index(i_r, i_theta); + x[index] = temp[index]; + }); + Kokkos::fence(); } template @@ -63,13 +68,18 @@ void ExtrapolatedSmootherGive::solveWhiteCircleSection(Vector>({0, 0}, + {num_white_circles, grid.ntheta()}), + KOKKOS_LAMBDA(const int circle_task, const int i_theta) { + const int i_r = start_white_circles + circle_task * 2; + const int index = grid.index(i_r, i_theta); + x[index] = temp[index]; + }); + Kokkos::fence(); } template @@ -86,13 +96,20 @@ void ExtrapolatedSmootherGive::solveBlackRadialSection(Vector>({0, grid.numberSmootherCircles()}, + {num_black_radial_lines, grid.nr()}), + KOKKOS_LAMBDA(const int radial_task, const int i_r) { + const int i_theta = start_black_radials + radial_task * 2; + const int index = grid.index(i_r, i_theta); + x[index] = temp[index]; + }); + Kokkos::fence(); } template @@ -109,11 +126,18 @@ void ExtrapolatedSmootherGive::solveWhiteRadialSection(Vector>({0, grid.numberSmootherCircles()}, + {num_white_radial_lines, grid.nr()}), + KOKKOS_LAMBDA(const int radial_task, const int i_r) { + const int i_theta = start_white_radials + radial_task * 2; + const int index = grid.index(i_r, i_theta); + x[index] = temp[index]; + }); + Kokkos::fence(); } diff --git a/include/ExtrapolatedSmoother/ExtrapolatedSmootherTake/buildInnerBoundaryAsc.inl b/include/ExtrapolatedSmoother/ExtrapolatedSmootherTake/buildInnerBoundaryAsc.inl index aae19b48..ebf2eb4d 100644 --- a/include/ExtrapolatedSmoother/ExtrapolatedSmootherTake/buildInnerBoundaryAsc.inl +++ b/include/ExtrapolatedSmoother/ExtrapolatedSmootherTake/buildInnerBoundaryAsc.inl @@ -5,8 +5,8 @@ namespace extrapolated_smoother_take #ifdef GMGPOLAR_USE_MUMPS // When using the MUMPS solver, the matrix is assembled in COO format. -static inline void update_CSR_COO_MatrixElement(SparseMatrixCOO& matrix, int ptr, int offset, int row, - int column, double value) +static inline void update_CSR_COO_MatrixElement(SparseMatrixCOO& matrix, int ptr, int offset, + int row, int column, double value) { matrix.set_row_index(ptr + offset, row); matrix.set_col_index(ptr + offset, column); diff --git a/include/ExtrapolatedSmoother/ExtrapolatedSmootherTake/extrapolatedSmootherTake.h b/include/ExtrapolatedSmoother/ExtrapolatedSmootherTake/extrapolatedSmootherTake.h index 8f5e78fe..841d8203 100644 --- a/include/ExtrapolatedSmoother/ExtrapolatedSmootherTake/extrapolatedSmootherTake.h +++ b/include/ExtrapolatedSmoother/ExtrapolatedSmootherTake/extrapolatedSmootherTake.h @@ -183,6 +183,8 @@ class ExtrapolatedSmootherTake : public ExtrapolatedSmoother /* Line-wise solvers */ /* ----------------- */ + // Functions must be public due to cuda restriction +public: // Solve the linear system: // A_sc * u_sc = f_sc − A_sc^ortho * u_sc^ortho // Parameter mapping: diff --git a/include/ExtrapolatedSmoother/ExtrapolatedSmootherTake/solveAscSystem.inl b/include/ExtrapolatedSmoother/ExtrapolatedSmootherTake/solveAscSystem.inl index 5b677e18..6f5c7679 100644 --- a/include/ExtrapolatedSmoother/ExtrapolatedSmootherTake/solveAscSystem.inl +++ b/include/ExtrapolatedSmoother/ExtrapolatedSmootherTake/solveAscSystem.inl @@ -27,13 +27,18 @@ void ExtrapolatedSmootherTake::solveBlackCircleSection(Vector>({0, 0}, + {num_black_circles, grid.ntheta()}), + KOKKOS_LAMBDA(const int circle_task, const int i_theta) { + const int i_r = start_black_circles + circle_task * 2; + const int index = grid.index(i_r, i_theta); + x[index] = temp[index]; + }); + Kokkos::fence(); } template @@ -63,13 +68,18 @@ void ExtrapolatedSmootherTake::solveWhiteCircleSection(Vector>({0, 0}, + {num_white_circles, grid.ntheta()}), + KOKKOS_LAMBDA(const int circle_task, const int i_theta) { + const int i_r = start_white_circles + circle_task * 2; + const int index = grid.index(i_r, i_theta); + x[index] = temp[index]; + }); + Kokkos::fence(); } template @@ -86,13 +96,20 @@ void ExtrapolatedSmootherTake::solveBlackRadialSection(Vector>({0, grid.numberSmootherCircles()}, + {num_black_radial_lines, grid.nr()}), + KOKKOS_LAMBDA(const int radial_task, const int i_r) { + const int i_theta = start_black_radials + radial_task * 2; + const int index = grid.index(i_r, i_theta); + x[index] = temp[index]; + }); + Kokkos::fence(); } template @@ -109,11 +126,18 @@ void ExtrapolatedSmootherTake::solveWhiteRadialSection(Vector>({0, grid.numberSmootherCircles()}, + {num_white_radial_lines, grid.nr()}), + KOKKOS_LAMBDA(const int radial_task, const int i_r) { + const int i_theta = start_white_radials + radial_task * 2; + const int index = grid.index(i_r, i_theta); + x[index] = temp[index]; + }); + Kokkos::fence(); } diff --git a/include/Smoother/SmootherGive/buildInnerBoundaryAsc.inl b/include/Smoother/SmootherGive/buildInnerBoundaryAsc.inl index dd92f089..c59c0359 100644 --- a/include/Smoother/SmootherGive/buildInnerBoundaryAsc.inl +++ b/include/Smoother/SmootherGive/buildInnerBoundaryAsc.inl @@ -5,8 +5,8 @@ namespace smoother_give #ifdef GMGPOLAR_USE_MUMPS // When using the MUMPS solver, the matrix is assembled in COO format. -static inline void update_CSR_COO_MatrixElement(SparseMatrixCOO& matrix, int ptr, int offset, int row, - int column, double value) +static inline void update_CSR_COO_MatrixElement(SparseMatrixCOO& matrix, int ptr, int offset, + int row, int column, double value) { matrix.set_row_index(ptr + offset, row); matrix.set_col_index(ptr + offset, column); diff --git a/include/Smoother/SmootherGive/smootherGive.h b/include/Smoother/SmootherGive/smootherGive.h index d5e12ddd..c2988286 100644 --- a/include/Smoother/SmootherGive/smootherGive.h +++ b/include/Smoother/SmootherGive/smootherGive.h @@ -180,6 +180,8 @@ class SmootherGive : public Smoother /* Line-wise solvers */ /* ----------------- */ + // Functions must be public due to cuda restriction +public: // Solve the linear system: // A_sc * u_sc = f_sc − A_sc^ortho * u_sc^ortho // Parameter mapping: diff --git a/include/Smoother/SmootherGive/solveAscSystem.inl b/include/Smoother/SmootherGive/solveAscSystem.inl index 364fcdb7..2a010874 100644 --- a/include/Smoother/SmootherGive/solveAscSystem.inl +++ b/include/Smoother/SmootherGive/solveAscSystem.inl @@ -22,13 +22,18 @@ void SmootherGive::solveBlackCircleSection(Vector x, Vec } // Move updated values to x - int start_black_circles = is_inner_circle_black ? 0 : 1; -#pragma omp parallel for num_threads(num_omp_threads) - for (int i_r = start_black_circles; i_r < grid.numberSmootherCircles(); i_r += 2) { - for (int i_theta = 0; i_theta < grid.ntheta(); i_theta++) { - x[grid.index(i_r, i_theta)] = temp[grid.index(i_r, i_theta)]; - } - } + const int start_black_circles = is_inner_circle_black ? 0 : 1; + const int num_black_circles = (grid.numberSmootherCircles() - start_black_circles + 1) / 2; + Kokkos::parallel_for( + "SmootherGive: moveUpdatedValues (Black Circular)", + Kokkos::MDRangePolicy>({0, 0}, + {num_black_circles, grid.ntheta()}), + KOKKOS_LAMBDA(const int circle_task, const int i_theta) { + const int i_r = start_black_circles + circle_task * 2; + const int index = grid.index(i_r, i_theta); + x[index] = temp[index]; + }); + Kokkos::fence(); } template @@ -53,13 +58,18 @@ void SmootherGive::solveWhiteCircleSection(Vector x, Vec } // Move updated values to x - int start_white_circles = is_inner_circle_white ? 0 : 1; -#pragma omp parallel for num_threads(num_omp_threads) - for (int i_r = start_white_circles; i_r < grid.numberSmootherCircles(); i_r += 2) { - for (int i_theta = 0; i_theta < grid.ntheta(); i_theta++) { - x[grid.index(i_r, i_theta)] = temp[grid.index(i_r, i_theta)]; - } - } + const int start_white_circles = is_inner_circle_white ? 0 : 1; + const int num_white_circles = (grid.numberSmootherCircles() - start_white_circles + 1) / 2; + Kokkos::parallel_for( + "SmootherGive: moveUpdatedValues (White Circular)", + Kokkos::MDRangePolicy>({0, 0}, + {num_white_circles, grid.ntheta()}), + KOKKOS_LAMBDA(const int circle_task, const int i_theta) { + const int i_r = start_white_circles + circle_task * 2; + const int index = grid.index(i_r, i_theta); + x[index] = temp[index]; + }); + Kokkos::fence(); } template @@ -76,13 +86,20 @@ void SmootherGive::solveBlackRadialSection(Vector x, Vec int batch_stride = 2; radial_tridiagonal_solver_.solve(radial_section, batch_offset, batch_stride); -// Move updated values to x -#pragma omp parallel for num_threads(num_omp_threads) - for (int i_theta = 0; i_theta < grid.ntheta(); i_theta += 2) { - for (int i_r = grid.numberSmootherCircles(); i_r < grid.nr(); i_r++) { - x[grid.index(i_r, i_theta)] = temp[grid.index(i_r, i_theta)]; - } - } + // Move updated values to x + assert(grid.ntheta() % 2 == 0); + const int start_black_radials = 0; + const int num_black_radial_lines = grid.ntheta() / 2; + Kokkos::parallel_for( + "SmootherGive: moveUpdatedValues (Black Radial)", + Kokkos::MDRangePolicy>({0, grid.numberSmootherCircles()}, + {num_black_radial_lines, grid.nr()}), + KOKKOS_LAMBDA(const int radial_task, const int i_r) { + const int i_theta = start_black_radials + radial_task * 2; + const int index = grid.index(i_r, i_theta); + x[index] = temp[index]; + }); + Kokkos::fence(); } template @@ -99,11 +116,18 @@ void SmootherGive::solveWhiteRadialSection(Vector x, Vec int batch_stride = 2; radial_tridiagonal_solver_.solve(radial_section, batch_offset, batch_stride); -// Move updated values to x -#pragma omp parallel for num_threads(num_omp_threads) - for (int i_theta = 1; i_theta < grid.ntheta(); i_theta += 2) { - for (int i_r = grid.numberSmootherCircles(); i_r < grid.nr(); i_r++) { - x[grid.index(i_r, i_theta)] = temp[grid.index(i_r, i_theta)]; - } - } + // Move updated values to x + assert(grid.ntheta() % 2 == 0); + const int start_white_radials = 1; + const int num_white_radial_lines = grid.ntheta() / 2; + Kokkos::parallel_for( + "SmootherGive: moveUpdatedValues (White Radial)", + Kokkos::MDRangePolicy>({0, grid.numberSmootherCircles()}, + {num_white_radial_lines, grid.nr()}), + KOKKOS_LAMBDA(const int radial_task, const int i_r) { + const int i_theta = start_white_radials + radial_task * 2; + const int index = grid.index(i_r, i_theta); + x[index] = temp[index]; + }); + Kokkos::fence(); } diff --git a/include/Smoother/SmootherTake/buildInnerBoundaryAsc.inl b/include/Smoother/SmootherTake/buildInnerBoundaryAsc.inl index b2f3cfd3..243962c9 100644 --- a/include/Smoother/SmootherTake/buildInnerBoundaryAsc.inl +++ b/include/Smoother/SmootherTake/buildInnerBoundaryAsc.inl @@ -5,12 +5,12 @@ namespace smoother_take #ifdef GMGPOLAR_USE_MUMPS // When using the MUMPS solver, the matrix is assembled in COO format. -static inline void update_CSR_COO_MatrixElement(SparseMatrixCOO& matrix, int ptr, int offset, int row, - int column, double value) +static inline void update_CSR_COO_MatrixElement(SparseMatrixCOO& matrix, int ptr, int offset, + int row, int column, double value) { matrix.set_row_index(ptr + offset, row); matrix.set_col_index(ptr + offset, column); - matrix.set_value(ptr + offset , value); + matrix.set_value(ptr + offset, value); } #else // When using the in-house solver, the matrix is stored in CSR format. diff --git a/include/Smoother/SmootherTake/smootherTake.h b/include/Smoother/SmootherTake/smootherTake.h index 63967288..e5f02849 100644 --- a/include/Smoother/SmootherTake/smootherTake.h +++ b/include/Smoother/SmootherTake/smootherTake.h @@ -165,7 +165,7 @@ class SmootherTake : public Smoother /* Orthogonal application */ /* ---------------------- */ - // Public is required as Cuda needs to be abe to get the address of functions enclosing lambda functions + // Public is required as Cuda needs to be able to get the address of functions enclosing lambda functions public: // Compute temp = f_sc − A_sc^ortho * u_sc^ortho (precomputed right-hand side) // where x = u_sc and rhs = f_sc @@ -174,7 +174,6 @@ class SmootherTake : public Smoother void applyAscOrthoBlackRadialSection(ConstVector x, ConstVector rhs, Vector temp); void applyAscOrthoWhiteRadialSection(ConstVector x, ConstVector rhs, Vector temp); -private: /* ----------------- */ /* Line-wise solvers */ /* ----------------- */ diff --git a/include/Smoother/SmootherTake/solveAscSystem.inl b/include/Smoother/SmootherTake/solveAscSystem.inl index 6d149b15..c53abdbc 100644 --- a/include/Smoother/SmootherTake/solveAscSystem.inl +++ b/include/Smoother/SmootherTake/solveAscSystem.inl @@ -22,13 +22,18 @@ void SmootherTake::solveBlackCircleSection(Vector x, Vec } // Move updated values to x - int start_black_circles = is_inner_circle_black ? 0 : 1; -#pragma omp parallel for num_threads(num_omp_threads) - for (int i_r = start_black_circles; i_r < grid.numberSmootherCircles(); i_r += 2) { - for (int i_theta = 0; i_theta < grid.ntheta(); i_theta++) { - x[grid.index(i_r, i_theta)] = temp[grid.index(i_r, i_theta)]; - } - } + const int start_black_circles = is_inner_circle_black ? 0 : 1; + const int num_black_circles = (grid.numberSmootherCircles() - start_black_circles + 1) / 2; + Kokkos::parallel_for( + "SmootherTake: moveUpdatedValues (Black Circular)", + Kokkos::MDRangePolicy>({0, 0}, + {num_black_circles, grid.ntheta()}), + KOKKOS_LAMBDA(const int circle_task, const int i_theta) { + const int i_r = start_black_circles + circle_task * 2; + const int index = grid.index(i_r, i_theta); + x[index] = temp[index]; + }); + Kokkos::fence(); } template @@ -53,13 +58,18 @@ void SmootherTake::solveWhiteCircleSection(Vector x, Vec } // Move updated values to x - int start_white_circles = is_inner_circle_white ? 0 : 1; -#pragma omp parallel for num_threads(num_omp_threads) - for (int i_r = start_white_circles; i_r < grid.numberSmootherCircles(); i_r += 2) { - for (int i_theta = 0; i_theta < grid.ntheta(); i_theta++) { - x[grid.index(i_r, i_theta)] = temp[grid.index(i_r, i_theta)]; - } - } + const int start_white_circles = is_inner_circle_white ? 0 : 1; + const int num_white_circles = (grid.numberSmootherCircles() - start_white_circles + 1) / 2; + Kokkos::parallel_for( + "SmootherTake: moveUpdatedValues (White Circular)", + Kokkos::MDRangePolicy>({0, 0}, + {num_white_circles, grid.ntheta()}), + KOKKOS_LAMBDA(const int circle_task, const int i_theta) { + const int i_r = start_white_circles + circle_task * 2; + const int index = grid.index(i_r, i_theta); + x[index] = temp[index]; + }); + Kokkos::fence(); } template @@ -76,13 +86,20 @@ void SmootherTake::solveBlackRadialSection(Vector x, Vec int batch_stride = 2; radial_tridiagonal_solver_.solve(radial_section, batch_offset, batch_stride); -// Move updated values to x -#pragma omp parallel for num_threads(num_omp_threads) - for (int i_theta = 0; i_theta < grid.ntheta(); i_theta += 2) { - for (int i_r = grid.numberSmootherCircles(); i_r < grid.nr(); i_r++) { - x[grid.index(i_r, i_theta)] = temp[grid.index(i_r, i_theta)]; - } - } + // Move updated values to x + assert(grid.ntheta() % 2 == 0); + const int start_black_radials = 0; + const int num_black_radial_lines = grid.ntheta() / 2; + Kokkos::parallel_for( + "SmootherTake: moveUpdatedValues (Black Radial)", + Kokkos::MDRangePolicy>({0, grid.numberSmootherCircles()}, + {num_black_radial_lines, grid.nr()}), + KOKKOS_LAMBDA(const int radial_task, const int i_r) { + const int i_theta = start_black_radials + radial_task * 2; + const int index = grid.index(i_r, i_theta); + x[index] = temp[index]; + }); + Kokkos::fence(); } template @@ -99,11 +116,18 @@ void SmootherTake::solveWhiteRadialSection(Vector x, Vec int batch_stride = 2; radial_tridiagonal_solver_.solve(radial_section, batch_offset, batch_stride); -// Move updated values to x -#pragma omp parallel for num_threads(num_omp_threads) - for (int i_theta = 1; i_theta < grid.ntheta(); i_theta += 2) { - for (int i_r = grid.numberSmootherCircles(); i_r < grid.nr(); i_r++) { - x[grid.index(i_r, i_theta)] = temp[grid.index(i_r, i_theta)]; - } - } + // Move updated values to x + assert(grid.ntheta() % 2 == 0); + const int start_white_radials = 1; + const int num_white_radial_lines = grid.ntheta() / 2; + Kokkos::parallel_for( + "SmootherTake: moveUpdatedValues (White Radial)", + Kokkos::MDRangePolicy>({0, grid.numberSmootherCircles()}, + {num_white_radial_lines, grid.nr()}), + KOKKOS_LAMBDA(const int radial_task, const int i_r) { + const int i_theta = start_white_radials + radial_task * 2; + const int index = grid.index(i_r, i_theta); + x[index] = temp[index]; + }); + Kokkos::fence(); }