1 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
2 // Copyright (c) 2016-21, Lawrence Livermore National Security, LLC
3 // and RAJA project contributors. See the RAJA/COPYRIGHT file for details.
4 //
5 // SPDX-License-Identifier: (BSD-3-Clause)
6 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
7
8 #include <cstdlib>
9 #include <cstdio>
10 #include <cstring>
11
12 #include <iostream>
13 #include <cmath>
14
15 #include "RAJA/RAJA.hpp"
16
17 #include "memoryManager.hpp"
18
19 /*
20 * Jacobi Example
21 *
22 * ----[Details]--------------------
23 * This code uses a five point finite difference stencil
24 * to discretize the following boundary value problem
25 *
26 * U_xx + U_yy = f on [0,1] x [0,1].
27 *
28 * The right-hand side is chosen to be
29 * f = 2*x*(y-1)*(y-2*x+x*y+2)*exp(x-y).
30 *
31 * A structured grid is used to discretize the domain
32 * [0,1] x [0,1]. Values inside the domain are computed
33 * using the Jacobi method to solve the associated
34 * linear system. The scheme is invoked until the l_2
35 * difference of subsequent iterations is below a
36 * tolerance.
37 *
38 * The scheme is implemented by allocating two arrays
39 * (I, Iold) and initialized to zero. The first set of
40 * nested for loops apply an iteration of the Jacobi
41 * scheme. The scheme is only applied to the interior
42 * nodes.
43 *
44 * The second set of nested for loops is used to
45 * update Iold and compute the l_2 norm of the
46 * difference of the iterates.
47 *
48 * Computing the l_2 norm requires a reduction operation.
49 * To simplify the reduction procedure, the RAJA API
50 * introduces thread safe variables.
51 *
52 * ----[RAJA Concepts]---------------
53 * - Forall::nested loop
54 * - RAJA Reduction
55 *
56 */
57
58 /*
59 * ----[Constant Values]-----
60 * CUDA_BLOCK_SIZE_X - Number of threads in the
61 * x-dimension of a cuda thread block
62 *
63 * CUDA_BLOCK_SIZE_Y - Number of threads in the
64 * y-dimension of a cuda thread block
65 *
66 * CUDA_BLOCK_SIZE - Number of threads per threads block
67 */
68 #if defined(RAJA_ENABLE_CUDA)
69 const int CUDA_BLOCK_SIZE = 256;
70 #endif
71
72 #if defined(RAJA_ENABLE_HIP)
73 const int HIP_BLOCK_SIZE = 256;
74 #endif
75
76 //
77 // Struct to hold grid info
78 // o - Origin in a cartesian dimension
79 // h - Spacing between grid points
80 // n - Number of grid points
81 //
82 struct grid_s {
83 double o, h;
84 int n;
85 };
86
87 //
88 // ----[Functions]---------
89 // solution - Function for the analytic solution
90 // computeErr - Displays the maximum error in the solution
91 //
92 double solution(double x, double y);
93 void computeErr(double *I, grid_s grid);
94
main(int RAJA_UNUSED_ARG (argc),char ** RAJA_UNUSED_ARG (argv[]))95 int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
96 {
97 std::cout << "Jacobi Example" << std::endl;
98
99 /*
100 * ----[Solver Parameters]------------
101 * tol - Method terminates once the norm is less than tol
102 * N - Number of unknown gridpoints per cartesian dimension
103 * NN - Total number of gridpoints on the grid
104 * maxIter - Maximum number of iterations to be taken
105 *
106 * resI2 - Residual
107 * iteration - Iteration number
108 * grid_s - Struct with grid information for a cartesian dimension
109 */
110 double tol = 1e-10;
111
112 int N = 50;
113 int NN = (N + 2) * (N + 2);
114 int maxIter = 100000;
115
116 double resI2;
117 int iteration;
118
119 grid_s gridx;
120 gridx.o = 0.0;
121 gridx.h = 1.0 / (N + 1.0);
122 gridx.n = N + 2;
123
124 //
125 //I, Iold - Holds iterates of Jacobi method
126 //
127 double *I = memoryManager::allocate<double>(NN);
128 double *Iold = memoryManager::allocate<double>(NN);
129
130 memset(I, 0, NN * sizeof(double));
131 memset(Iold, 0, NN * sizeof(double));
132
133 printf("Standard C++ Loop \n");
134 resI2 = 1;
135 iteration = 0;
136
137 while (resI2 > tol * tol) {
138 //
139 // Jacobi Iteration
140 //
141 for (int n = 1; n <= N; ++n) {
142 for (int m = 1; m <= N; ++m) {
143 double x = gridx.o + m * gridx.h;
144 double y = gridx.o + n * gridx.h;
145
146 double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));
147
148 int id = n * (N + 2) + m;
149 I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]);
150 }
151 }
152
153 //
154 // Compute residual and update Iold
155 //
156 resI2 = 0.0;
157 for (int k = 0; k < NN; k++) {
158 resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
159 Iold[k] = I[k];
160 }
161
162 if (iteration > maxIter) {
163 printf("Standard C++ Loop - Maxed out on iterations \n");
164 exit(-1);
165 }
166
167 iteration++;
168 }
169 computeErr(I, gridx);
170 printf("No of iterations: %d \n \n", iteration);
171
172 //
173 // RAJA loop calls may be shortened by predefining policies
174 //
175 RAJA::RangeSegment gridRange(0, NN);
176 RAJA::RangeSegment jacobiRange(1, N + 1);
177
178 using jacobiSeqNestedPolicy = RAJA::KernelPolicy<RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>;
179
180 printf("RAJA: Sequential Policy - Nested ForallN \n");
181 resI2 = 1;
182 iteration = 0;
183 memset(I, 0, NN * sizeof(double));
184 memset(Iold, 0, NN * sizeof(double));
185
186 /*
187 * Sequential Jacobi Iteration.
188 *
189 * Note that a RAJA ReduceSum object is used to accumulate the sum
190 * for the residual. Since the loop is run sequentially, this is
191 * not strictly necessary. It is done here for consistency and
192 * comparison with other RAJA variants in this example.
193 */
194 while (resI2 > tol * tol) {
195 RAJA::kernel<jacobiSeqNestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=](RAJA::Index_type m, RAJA::Index_type n) {
196 double x = gridx.o + m * gridx.h;
197 double y = gridx.o + n * gridx.h;
198
199 double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));
200
201 int id = n * (N + 2) + m;
202 I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]);
203 });
204
205 RAJA::ReduceSum<RAJA::seq_reduce, double> RAJA_resI2(0.0);
206 RAJA::forall<RAJA::seq_exec>(gridRange, [=](RAJA::Index_type k) {
207 RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
208 Iold[k] = I[k];
209 });
210
211 resI2 = RAJA_resI2;
212 if (iteration > maxIter) {
213 printf("Jacobi: Sequential - Maxed out on iterations! \n");
214 exit(-1);
215 }
216 iteration++;
217 }
218 computeErr(I, gridx);
219 printf("No of iterations: %d \n \n", iteration);
220
221 #if defined(RAJA_ENABLE_OPENMP)
222 printf("RAJA: OpenMP Policy - Nested ForallN \n");
223 resI2 = 1;
224 iteration = 0;
225 memset(I, 0, NN * sizeof(double));
226 memset(Iold, 0, NN * sizeof(double));
227
228 /*
229 * OpenMP parallel Jacobi Iteration.
230 *
231 * ----[RAJA Policies]-----------
232 * RAJA::omp_collapse_for_exec -
233 * introduced a nested region
234 *
235 * Note that OpenMP RAJA ReduceSum object performs the reduction
236 * operation for the residual in a thread-safe manner.
237 */
238
239 using jacobiOmpNestedPolicy = RAJA::KernelPolicy<RAJA::statement::For<1, RAJA::omp_parallel_for_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>;
240
241 while (resI2 > tol * tol) {
242 RAJA::kernel<jacobiOmpNestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=](RAJA::Index_type m, RAJA::Index_type n) {
243 double x = gridx.o + m * gridx.h;
244 double y = gridx.o + n * gridx.h;
245
246 double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));
247
248 int id = n * (N + 2) + m;
249 I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]);
250 });
251
252 RAJA::ReduceSum<RAJA::omp_reduce, double> RAJA_resI2(0.0);
253
254 RAJA::forall<RAJA::omp_parallel_for_exec>(gridRange, [=](RAJA::Index_type k) {
255 RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
256 Iold[k] = I[k];
257 });
258
259 resI2 = RAJA_resI2;
260 if (iteration > maxIter) {
261 printf("Jacobi: OpenMP - Maxed out on iterations! \n");
262 exit(-1);
263 }
264 iteration++;
265 }
266 computeErr(I, gridx);
267 printf("No of iterations: %d \n \n", iteration);
268 #endif
269
270 #if defined(RAJA_ENABLE_CUDA)
271 /*
272 * CUDA Jacobi Iteration.
273 *
274 * ----[RAJA Policies]-----------
275 * RAJA::cuda_threadblock_y_exec, RAJA::cuda_threadblock_x_exec -
276 * define the mapping of loop iterations to GPU thread blocks
277 *
278 * Note that CUDA RAJA ReduceSum object performs the reduction
279 * operation for the residual in a thread-safe manner on the GPU.
280 */
281
282 printf("RAJA: CUDA Policy - Nested ForallN \n");
283
284 using jacobiCUDANestedPolicy = RAJA::KernelPolicy<RAJA::statement::CudaKernel<RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::cuda_block_y_loop, RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::cuda_block_x_loop, RAJA::statement::For<1, RAJA::cuda_thread_y_direct, RAJA::statement::For<0, RAJA::cuda_thread_x_direct, RAJA::statement::Lambda<0>>>>>>>;
285
286 resI2 = 1;
287 iteration = 0;
288 memset(I, 0, NN * sizeof(double));
289 memset(Iold, 0, NN * sizeof(double));
290
291 while (resI2 > tol * tol) {
292 //
293 // Jacobi Iteration
294 //
295 RAJA::kernel<jacobiCUDANestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=] RAJA_DEVICE(RAJA::Index_type m, RAJA::Index_type n) {
296 double x = gridx.o + m * gridx.h;
297 double y = gridx.o + n * gridx.h;
298
299 double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));
300
301 int id = n * (N + 2) + m;
302 I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]);
303 });
304
305 //
306 // Compute residual and update Iold
307 //
308 RAJA::ReduceSum<RAJA::cuda_reduce, double> RAJA_resI2(0.0);
309 RAJA::forall<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(gridRange, [=] RAJA_DEVICE(RAJA::Index_type k) {
310 RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
311 Iold[k] = I[k];
312 });
313
314 resI2 = RAJA_resI2;
315
316 if (iteration > maxIter) {
317 printf("RAJA: CUDA - Maxed out on iterations! \n");
318 exit(-1);
319 }
320 iteration++;
321 }
322 cudaDeviceSynchronize();
323 computeErr(I, gridx);
324 printf("No of iterations: %d \n \n", iteration);
325 #endif
326
327 #if defined(RAJA_ENABLE_HIP)
328 /*
329 * HIP Jacobi Iteration.
330 *
331 * ----[RAJA Policies]-----------
332 * RAJA::cuda_threadblock_y_exec, RAJA::cuda_threadblock_x_exec -
333 * define the mapping of loop iterations to GPU thread blocks
334 *
335 * Note that HIP RAJA ReduceSum object performs the reduction
336 * operation for the residual in a thread-safe manner on the GPU.
337 */
338
339 printf("RAJA: HIP Policy - Nested ForallN \n");
340
341 using jacobiHIPNestedPolicy = RAJA::KernelPolicy<RAJA::statement::HipKernel<RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::hip_block_y_loop, RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::hip_block_x_loop, RAJA::statement::For<1, RAJA::hip_thread_y_direct, RAJA::statement::For<0, RAJA::hip_thread_x_direct, RAJA::statement::Lambda<0>>>>>>>;
342
343 resI2 = 1;
344 iteration = 0;
345 memset(I, 0, NN * sizeof(double));
346 memset(Iold, 0, NN * sizeof(double));
347
348 double *d_I = memoryManager::allocate_gpu<double>(NN);
349 double *d_Iold = memoryManager::allocate_gpu<double>(NN);
350 hipErrchk(hipMemcpy(d_I, I, NN * sizeof(double), hipMemcpyHostToDevice));
351 hipErrchk(hipMemcpy(d_Iold, Iold, NN * sizeof(double), hipMemcpyHostToDevice));
352
353 while (resI2 > tol * tol) {
354 //
355 // Jacobi Iteration
356 //
357 RAJA::kernel<jacobiHIPNestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=] RAJA_DEVICE(RAJA::Index_type m, RAJA::Index_type n) {
358 double x = gridx.o + m * gridx.h;
359 double y = gridx.o + n * gridx.h;
360
361 double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));
362
363 int id = n * (N + 2) + m;
364 d_I[id] = 0.25 * (-f + d_Iold[id - N - 2] + d_Iold[id + N + 2] + d_Iold[id - 1] + d_Iold[id + 1]);
365 });
366
367 //
368 // Compute residual and update Iold
369 //
370 RAJA::ReduceSum<RAJA::hip_reduce, double> RAJA_resI2(0.0);
371 RAJA::forall<RAJA::hip_exec<HIP_BLOCK_SIZE>>(gridRange, [=] RAJA_DEVICE(RAJA::Index_type k) {
372 RAJA_resI2 += (d_I[k] - d_Iold[k]) * (d_I[k] - d_Iold[k]);
373 d_Iold[k] = d_I[k];
374 });
375
376 resI2 = RAJA_resI2;
377
378 if (iteration > maxIter) {
379 printf("RAJA: HIP - Maxed out on iterations! \n");
380 exit(-1);
381 }
382 iteration++;
383 }
384 hipDeviceSynchronize();
385 hipErrchk(hipMemcpy(I, d_I, NN * sizeof(double), hipMemcpyDeviceToHost));
386 computeErr(I, gridx);
387 printf("No of iterations: %d \n \n", iteration);
388
389 memoryManager::deallocate_gpu(d_I);
390 memoryManager::deallocate_gpu(d_Iold);
391 #endif
392
393 memoryManager::deallocate(I);
394 memoryManager::deallocate(Iold);
395
396 return 0;
397 }
398
399 //
400 // Function for the analytic solution
401 //
solution(double x,double y)402 double solution(double x, double y)
403 {
404 return x * y * exp(x - y) * (1 - x) * (1 - y);
405 }
406
407 //
408 // Error is computed via ||I_{approx}(:) - U_{analytic}(:)||_{inf}
409 //
computeErr(double * I,grid_s grid)410 void computeErr(double *I, grid_s grid)
411 {
412 RAJA::RangeSegment gridRange(0, grid.n);
413 RAJA::ReduceMax<RAJA::seq_reduce, double> tMax(-1.0);
414
415 using jacobiSeqNestedPolicy = RAJA::KernelPolicy<RAJA::statement::For<1, RAJA::seq_exec, RAJA::statement::For<0, RAJA::seq_exec, RAJA::statement::Lambda<0>>>>;
416
417 RAJA::kernel<jacobiSeqNestedPolicy>(RAJA::make_tuple(gridRange, gridRange), [=](RAJA::Index_type ty, RAJA::Index_type tx) {
418 int id = tx + grid.n * ty;
419 double x = grid.o + tx * grid.h;
420 double y = grid.o + ty * grid.h;
421 double myErr = std::abs(I[id] - solution(x, y));
422 tMax.max(myErr);
423 });
424
425 double l2err = tMax;
426 printf("Max error = %lg, h = %f \n", l2err, grid.h);
427 }
428
429 /*TEST
430
431 test:
432 requires: raja !cuda
433 filter: sed -e "/RAJA: OpenMP Policy/,+3d"
434
435 test:
436 suffix: 2
437 requires: raja cuda
438 filter: sed -e "/RAJA: OpenMP Policy/,+3d"
439
440 TEST*/
441