//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
// Copyright (c) 2016-21, Lawrence Livermore National Security, LLC
// and RAJA project contributors. See the RAJA/COPYRIGHT file for details.
//
// SPDX-License-Identifier: (BSD-3-Clause)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//

#include <cstdlib>
#include <cstdio>
#include <cstring>

#include <iostream>
#include <cmath>

#include "RAJA/RAJA.hpp"

#include "memoryManager.hpp"

/*
 * Jacobi Example
 *
 * ----[Details]--------------------
 * This code uses a five-point finite difference stencil
 * to discretize the boundary value problem
 *
 * U_xx + U_yy = f on [0,1] x [0,1].
 *
 * The right-hand side is chosen to be
 * f = 2*x*(y-1)*(y-2*x+x*y+2)*exp(x-y).
 *
 * A structured grid is used to discretize the domain
 * [0,1] x [0,1]. Values inside the domain are computed
 * using the Jacobi method to solve the associated
 * linear system. The scheme is iterated until the l_2
 * norm of the difference between successive iterates
 * falls below a tolerance.
 *
 * The scheme is implemented by allocating two arrays
 * (I, Iold), both initialized to zero. The first set of
 * nested for loops applies one iteration of the Jacobi
 * scheme. The scheme is applied only to the interior
 * nodes.
 *
 * The second set of nested for loops updates Iold and
 * computes the l_2 norm of the difference between the
 * iterates.
 *
 * Computing the l_2 norm requires a reduction operation.
 * To simplify the reduction procedure, the RAJA API
 * provides thread-safe reduction variables (e.g.,
 * RAJA::ReduceSum).
 *
 * ----[RAJA Concepts]---------------
 * - RAJA::kernel nested loops
 * - RAJA reductions
 *
 */
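
/*
 * For reference, with grid spacing h the five-point stencil gives the
 * Jacobi update used by every variant below:
 *
 *   I(m,n) = 0.25 * ( Iold(m-1,n) + Iold(m+1,n)
 *                   + Iold(m,n-1) + Iold(m,n+1) - h^2 * f(x_m, y_n) )
 *
 * and the stopping test compares the squared l_2 norm
 *
 *   resI2 = sum_k ( I[k] - Iold[k] )^2
 *
 * against tol^2, which avoids taking a square root each iteration.
 */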

/*
 *  ----[Constant Values]-----
 * CUDA_BLOCK_SIZE - Number of threads per CUDA thread block
 *
 * HIP_BLOCK_SIZE  - Number of threads per HIP thread block
*/
#if defined(RAJA_ENABLE_CUDA)
const int CUDA_BLOCK_SIZE = 256;
#endif

#if defined(RAJA_ENABLE_HIP)
const int HIP_BLOCK_SIZE = 256;
#endif

//
//  Struct to hold grid info
//  o - Origin in a cartesian dimension
//  h - Spacing between grid points
//  n - Number of grid points
//
struct grid_s {
  double o, h;
  int    n;
};

//
// ----[Functions]---------
// solution   - Function for the analytic solution
// computeErr - Displays the maximum error in the solution
//
double solution(double x, double y);
void   computeErr(double *I, grid_s grid);

int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
{
  std::cout << "Jacobi Example" << std::endl;

  /*
   * ----[Solver Parameters]------------
   * tol       - Method terminates once the norm is less than tol
   * N         - Number of unknown gridpoints per cartesian dimension
   * NN        - Total number of gridpoints on the grid
   * maxIter   - Maximum number of iterations to be taken
   *
   * resI2     - Squared l_2 norm of the difference between iterates
   * iteration - Iteration number
   * gridx     - Struct with grid information for a cartesian dimension
  */
  double tol = 1e-10;

  int N       = 50;
  int NN      = (N + 2) * (N + 2);
  int maxIter = 100000;

  double resI2;
  int    iteration;

  grid_s gridx;
  gridx.o = 0.0;
  gridx.h = 1.0 / (N + 1.0);
  gridx.n = N + 2;

  //
  // I, Iold - Hold iterates of the Jacobi method
  //
  double *I    = memoryManager::allocate<double>(NN);
  double *Iold = memoryManager::allocate<double>(NN);

  memset(I, 0, NN * sizeof(double));
  memset(Iold, 0, NN * sizeof(double));

  printf("Standard  C++ Loop \n");
  resI2     = 1;
  iteration = 0;

  while (resI2 > tol * tol) {
    //
    // Jacobi Iteration
    //
    for (int n = 1; n <= N; ++n) {
      for (int m = 1; m <= N; ++m) {
        double x = gridx.o + m * gridx.h;
        double y = gridx.o + n * gridx.h;

        double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));

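        // Row-major indexing: each grid row holds N + 2 points, so the
        // four neighbors of node id are id +/- 1 (x-direction) and
        // id +/- (N + 2) (y-direction).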
        int id = n * (N + 2) + m;
        I[id]  = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]);
      }
    }

    //
    // Compute residual and update Iold
    //
    resI2 = 0.0;
    for (int k = 0; k < NN; k++) {
      resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
      Iold[k] = I[k];
    }

    if (iteration > maxIter) {
      printf("Standard C++ Loop - Maxed out on iterations \n");
      exit(-1);
    }

    iteration++;
  }
  computeErr(I, gridx);
  printf("No of iterations: %d \n \n", iteration);

  //
  // RAJA loop calls may be shortened by predefining policies
  //
  RAJA::RangeSegment gridRange(0, NN);
  RAJA::RangeSegment jacobiRange(1, N + 1);

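  //
  // In the kernel policy below, For<1> drives the outer (n) loop and
  // For<0> the inner (m) loop; Lambda<0> invokes the loop body with
  // (m, n) drawn from the iteration-space tuple.
  //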
  using jacobiSeqNestedPolicy = RAJA::KernelPolicy<
    RAJA::statement::For<1, RAJA::seq_exec,
      RAJA::statement::For<0, RAJA::seq_exec,
        RAJA::statement::Lambda<0>>>>;

  printf("RAJA: Sequential Policy - Nested ForallN \n");
  resI2     = 1;
  iteration = 0;
  memset(I, 0, NN * sizeof(double));
  memset(Iold, 0, NN * sizeof(double));

  /*
   *  Sequential Jacobi Iteration.
   *
   *  Note that a RAJA ReduceSum object is used to accumulate the sum
   *  for the residual. Since the loop is run sequentially, this is
   *  not strictly necessary. It is done here for consistency and
   *  comparison with other RAJA variants in this example.
   */
  while (resI2 > tol * tol) {
    RAJA::kernel<jacobiSeqNestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=](RAJA::Index_type m, RAJA::Index_type n) {
      double x = gridx.o + m * gridx.h;
      double y = gridx.o + n * gridx.h;

      double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));

      int id = n * (N + 2) + m;
      I[id]  = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]);
    });

    RAJA::ReduceSum<RAJA::seq_reduce, double> RAJA_resI2(0.0);
    RAJA::forall<RAJA::seq_exec>(gridRange, [=](RAJA::Index_type k) {
      RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
      Iold[k] = I[k];
    });

    resI2 = RAJA_resI2;
    if (iteration > maxIter) {
      printf("Jacobi: Sequential - Maxed out on iterations! \n");
      exit(-1);
    }
    iteration++;
  }
  computeErr(I, gridx);
  printf("No of iterations: %d \n \n", iteration);

#if defined(RAJA_ENABLE_OPENMP)
  printf("RAJA: OpenMP Policy - Nested ForallN \n");
  resI2     = 1;
  iteration = 0;
  memset(I, 0, NN * sizeof(double));
  memset(Iold, 0, NN * sizeof(double));

  /*
   *  OpenMP parallel Jacobi Iteration.
   *
   *  ----[RAJA Policies]-----------
   *  RAJA::omp_parallel_for_exec -
   *  executes the outer loop in an OpenMP parallel region
   *
   *  Note that the OpenMP RAJA ReduceSum object performs the reduction
   *  operation for the residual in a thread-safe manner.
   */

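  //
  // Same nesting as the sequential policy, except the outer (n) loop
  // runs as an OpenMP parallel for; the inner (m) loop stays sequential
  // within each thread.
  //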
  using jacobiOmpNestedPolicy = RAJA::KernelPolicy<
    RAJA::statement::For<1, RAJA::omp_parallel_for_exec,
      RAJA::statement::For<0, RAJA::seq_exec,
        RAJA::statement::Lambda<0>>>>;

  while (resI2 > tol * tol) {
    RAJA::kernel<jacobiOmpNestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=](RAJA::Index_type m, RAJA::Index_type n) {
      double x = gridx.o + m * gridx.h;
      double y = gridx.o + n * gridx.h;

      double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));

      int id = n * (N + 2) + m;
      I[id]  = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]);
    });

    RAJA::ReduceSum<RAJA::omp_reduce, double> RAJA_resI2(0.0);

    RAJA::forall<RAJA::omp_parallel_for_exec>(gridRange, [=](RAJA::Index_type k) {
      RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
      Iold[k] = I[k];
    });

    resI2 = RAJA_resI2;
    if (iteration > maxIter) {
      printf("Jacobi: OpenMP - Maxed out on iterations! \n");
      exit(-1);
    }
    iteration++;
  }
  computeErr(I, gridx);
  printf("No of iterations: %d \n \n", iteration);
#endif

#if defined(RAJA_ENABLE_CUDA)
  /*
   *  CUDA Jacobi Iteration.
   *
   *  ----[RAJA Policies]-----------
   *  RAJA::cuda_block_{x,y}_loop, RAJA::cuda_thread_{x,y}_direct -
   *  define the mapping of loop iterations to GPU thread blocks
   *  and to threads within each block
   *
   *  Note that the CUDA RAJA ReduceSum object performs the reduction
   *  operation for the residual in a thread-safe manner on the GPU.
   */

  printf("RAJA: CUDA Policy - Nested ForallN \n");

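  //
  // The CudaKernel policy tiles the 2D iteration space into 32 x 32
  // tiles, maps tiles to CUDA blocks (cuda_block_{x,y}_loop), and maps
  // the iterates within a tile directly to threads
  // (cuda_thread_{x,y}_direct).
  //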
  using jacobiCUDANestedPolicy = RAJA::KernelPolicy<
    RAJA::statement::CudaKernel<
      RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::cuda_block_y_loop,
        RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::cuda_block_x_loop,
          RAJA::statement::For<1, RAJA::cuda_thread_y_direct,
            RAJA::statement::For<0, RAJA::cuda_thread_x_direct,
              RAJA::statement::Lambda<0>>>>>>>;

  resI2     = 1;
  iteration = 0;
  memset(I, 0, NN * sizeof(double));
  memset(Iold, 0, NN * sizeof(double));

  while (resI2 > tol * tol) {
    //
    // Jacobi Iteration
    //
    RAJA::kernel<jacobiCUDANestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=] RAJA_DEVICE(RAJA::Index_type m, RAJA::Index_type n) {
      double x = gridx.o + m * gridx.h;
      double y = gridx.o + n * gridx.h;

      double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));

      int id = n * (N + 2) + m;
      I[id]  = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]);
    });

    //
    // Compute residual and update Iold
    //
    RAJA::ReduceSum<RAJA::cuda_reduce, double> RAJA_resI2(0.0);
    RAJA::forall<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(gridRange, [=] RAJA_DEVICE(RAJA::Index_type k) {
      RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
      Iold[k] = I[k];
    });

    resI2 = RAJA_resI2;

    if (iteration > maxIter) {
      printf("RAJA: CUDA - Maxed out on iterations! \n");
      exit(-1);
    }
    iteration++;
  }
  cudaDeviceSynchronize();
  computeErr(I, gridx);
  printf("No of iterations: %d \n \n", iteration);
#endif

#if defined(RAJA_ENABLE_HIP)
  /*
   *  HIP Jacobi Iteration.
   *
   *  ----[RAJA Policies]-----------
   *  RAJA::hip_block_{x,y}_loop, RAJA::hip_thread_{x,y}_direct -
   *  define the mapping of loop iterations to GPU thread blocks
   *  and to threads within each block
   *
   *  Note that the HIP RAJA ReduceSum object performs the reduction
   *  operation for the residual in a thread-safe manner on the GPU.
   */

  printf("RAJA: HIP Policy - Nested ForallN \n");

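  //
  // Mirrors the CUDA kernel policy above, with HipKernel and the hip_*
  // block/thread policies taking the place of their CUDA counterparts.
  //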
  using jacobiHIPNestedPolicy = RAJA::KernelPolicy<
    RAJA::statement::HipKernel<
      RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::hip_block_y_loop,
        RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::hip_block_x_loop,
          RAJA::statement::For<1, RAJA::hip_thread_y_direct,
            RAJA::statement::For<0, RAJA::hip_thread_x_direct,
              RAJA::statement::Lambda<0>>>>>>>;

  resI2     = 1;
  iteration = 0;
  memset(I, 0, NN * sizeof(double));
  memset(Iold, 0, NN * sizeof(double));

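  //
  // Unlike the variants above, which operate on the memoryManager
  // buffers directly, the HIP variant stages explicit device copies of
  // I and Iold and copies the result back to the host after the solve.
  //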
  double *d_I    = memoryManager::allocate_gpu<double>(NN);
  double *d_Iold = memoryManager::allocate_gpu<double>(NN);
  hipErrchk(hipMemcpy(d_I, I, NN * sizeof(double), hipMemcpyHostToDevice));
  hipErrchk(hipMemcpy(d_Iold, Iold, NN * sizeof(double), hipMemcpyHostToDevice));

  while (resI2 > tol * tol) {
    //
    // Jacobi Iteration
    //
    RAJA::kernel<jacobiHIPNestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=] RAJA_DEVICE(RAJA::Index_type m, RAJA::Index_type n) {
      double x = gridx.o + m * gridx.h;
      double y = gridx.o + n * gridx.h;

      double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));

      int id  = n * (N + 2) + m;
      d_I[id] = 0.25 * (-f + d_Iold[id - N - 2] + d_Iold[id + N + 2] + d_Iold[id - 1] + d_Iold[id + 1]);
    });

    //
    // Compute residual and update Iold
    //
    RAJA::ReduceSum<RAJA::hip_reduce, double> RAJA_resI2(0.0);
    RAJA::forall<RAJA::hip_exec<HIP_BLOCK_SIZE>>(gridRange, [=] RAJA_DEVICE(RAJA::Index_type k) {
      RAJA_resI2 += (d_I[k] - d_Iold[k]) * (d_I[k] - d_Iold[k]);
      d_Iold[k] = d_I[k];
    });

    resI2 = RAJA_resI2;

    if (iteration > maxIter) {
      printf("RAJA: HIP - Maxed out on iterations! \n");
      exit(-1);
    }
    iteration++;
  }
  hipDeviceSynchronize();
  hipErrchk(hipMemcpy(I, d_I, NN * sizeof(double), hipMemcpyDeviceToHost));
  computeErr(I, gridx);
  printf("No of iterations: %d \n \n", iteration);

  memoryManager::deallocate_gpu(d_I);
  memoryManager::deallocate_gpu(d_Iold);
#endif

  memoryManager::deallocate(I);
  memoryManager::deallocate(Iold);

  return 0;
}

//
// Function for the analytic solution
//
double solution(double x, double y)
{
  return x * y * exp(x - y) * (1 - x) * (1 - y);
}

//
// Error is computed via ||I_{approx}(:) - U_{analytic}(:)||_{inf}
//
void computeErr(double *I, grid_s grid)
{
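  //
  // Accumulate the max-norm of the error with a RAJA::ReduceMax object
  // over every grid point, boundaries included.
  //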
  RAJA::RangeSegment                        gridRange(0, grid.n);
  RAJA::ReduceMax<RAJA::seq_reduce, double> tMax(-1.0);

  using jacobiSeqNestedPolicy = RAJA::KernelPolicy<
    RAJA::statement::For<1, RAJA::seq_exec,
      RAJA::statement::For<0, RAJA::seq_exec,
        RAJA::statement::Lambda<0>>>>;

  RAJA::kernel<jacobiSeqNestedPolicy>(RAJA::make_tuple(gridRange, gridRange), [=](RAJA::Index_type ty, RAJA::Index_type tx) {
    int    id    = tx + grid.n * ty;
    double x     = grid.o + tx * grid.h;
    double y     = grid.o + ty * grid.h;
    double myErr = std::abs(I[id] - solution(x, y));
    tMax.max(myErr);
  });

  double maxErr = tMax;
  printf("Max error = %lg, h = %f \n", maxErr, grid.h);
}

/*TEST

    test:
      requires: raja !cuda
      filter: sed -e "/RAJA: OpenMP Policy/,+3d"

    test:
      suffix: 2
      requires: raja cuda
      filter: sed -e "/RAJA: OpenMP Policy/,+3d"

TEST*/