xref: /libCEED/backends/hip-shared/ceed-hip-shared-basis.c (revision f1c40530bc6f02bc6ca847af718b8b41cc3fd3ca)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed/ceed.h>
9 #include <ceed/backend.h>
10 #include <ceed/jit-tools.h>
11 #include <hip/hip_runtime.h>
12 #include <stddef.h>
13 #include "ceed-hip-shared.h"
14 #include "../hip/ceed-hip-common.h"
15 #include "../hip/ceed-hip-compile.h"
16 
17 //------------------------------------------------------------------------------
18 // Compute a block size based on required minimum threads
19 //------------------------------------------------------------------------------
20 static CeedInt ComputeBlockSizeFromRequirement(const CeedInt required) {
21   CeedInt maxSize = 1024;    // Max total threads per block
22   CeedInt currentSize = 64;  // Start with one group
23 
24   while(currentSize < maxSize) {
25     if (currentSize > required)
26       break;
27     else
28       currentSize = currentSize * 2;
29   }
30   return currentSize;
31 }
32 
33 //------------------------------------------------------------------------------
34 // Compute required thread block sizes for basis kernels given P, Q, dim, and
35 // num_comp (num_comp not currently used, but may be again in other basis
36 // parallelization options)
37 //------------------------------------------------------------------------------
38 static int ComputeBasisThreadBlockSizes(const CeedInt dim, const CeedInt P_1d,
39                                         const CeedInt Q_1d,
40                                         const CeedInt num_comp, CeedInt *block_sizes) {
41 
42   // Note that this will use the same block sizes for all dimensions when compiling,
43   // but as each basis object is defined for a particular dimension, we will never
44   // call any kernels except the ones for the dimension for which we have computed the
45   // block sizes.
46   const CeedInt thread_1d = CeedIntMax(P_1d, Q_1d);
47   switch (dim) {
48   case 1: {
49     // Interp kernels:
50     block_sizes[0] = 256;
51 
52     // Grad kernels:
53     block_sizes[1] = 256;
54 
55     // Weight kernels:
56     block_sizes[2] = 256;
57 
58   } break;
59   case 2: {
60     // Interp kernels:
61     CeedInt required = thread_1d * thread_1d;
62     block_sizes[0] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required));
63 
64     // Grad kernels: currently use same required minimum threads
65     block_sizes[1] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required));
66 
67     // Weight kernels:
68     required = CeedIntMax(64, Q_1d * Q_1d);
69     block_sizes[2] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required));
70 
71   } break;
72   case 3: {
73     // Interp kernels:
74     CeedInt required = thread_1d * thread_1d;
75     block_sizes[0] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required));
76 
77     // Grad kernels: currently use same required minimum threads
78     block_sizes[1] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required));
79 
80     // Weight kernels:
81     required = Q_1d * Q_1d * Q_1d;
82     block_sizes[2] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required));
83   }
84   }
85 
86   return CEED_ERROR_SUCCESS;
87 }
88 
89 //------------------------------------------------------------------------------
90 // Apply basis
91 //------------------------------------------------------------------------------
92 int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem,
93                                     CeedTransposeMode t_mode,
94                                     CeedEvalMode eval_mode, CeedVector u,
95                                     CeedVector v) {
96   int ierr;
97   Ceed ceed;
98   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
99   Ceed_Hip *ceed_Hip;
100   CeedGetData(ceed, &ceed_Hip); CeedChkBackend(ierr);
101   CeedBasis_Hip_shared *data;
102   CeedBasisGetData(basis, &data); CeedChkBackend(ierr);
103   CeedInt dim, num_comp;
104   ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr);
105   ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr);
106 
107   // Read vectors
108   const CeedScalar *d_u;
109   CeedScalar *d_v;
110   if (eval_mode != CEED_EVAL_WEIGHT) {
111     ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr);
112   }
113   ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr);
114 
115   // Apply basis operation
116   switch (eval_mode) {
117   case CEED_EVAL_INTERP: {
118     CeedInt P_1d, Q_1d;
119     CeedInt block_size = data->block_sizes[0];
120     ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr);
121     ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr);
122     CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
123     void *interp_args[] = {(void *) &num_elem, &data->d_interp_1d,
124                            &d_u, &d_v
125                           };
126     if (dim == 1) {
127       CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
128       elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
129       CeedInt grid = num_elem / elems_per_block +
130                      ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
131       CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar);
132       if (t_mode == CEED_TRANSPOSE) {
133         ierr = CeedRunKernelDimSharedHip(ceed, data->InterpTranspose, grid, thread_1d,
134                                          1,
135                                          elems_per_block, shared_mem,
136                                          interp_args); CeedChkBackend(ierr);
137       } else {
138         ierr = CeedRunKernelDimSharedHip(ceed, data->Interp, grid, thread_1d, 1,
139                                          elems_per_block, shared_mem,
140                                          interp_args); CeedChkBackend(ierr);
141       }
142     } else if (dim == 2) {
143       // Check if required threads is small enough to do multiple elems
144       const CeedInt elems_per_block = CeedIntMax(block_size /
145                                       (thread_1d*thread_1d), 1);
146       CeedInt grid = num_elem / elems_per_block +
147                      ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
148       CeedInt shared_mem = elems_per_block*thread_1d*thread_1d*sizeof(
149                              CeedScalar);
150       if (t_mode == CEED_TRANSPOSE) {
151         ierr = CeedRunKernelDimSharedHip(ceed, data->InterpTranspose, grid, thread_1d,
152                                          thread_1d,
153                                          elems_per_block, shared_mem,
154                                          interp_args); CeedChkBackend(ierr);
155       } else {
156         ierr = CeedRunKernelDimSharedHip(ceed, data->Interp, grid, thread_1d, thread_1d,
157                                          elems_per_block, shared_mem,
158                                          interp_args); CeedChkBackend(ierr);
159       }
160     } else if (dim == 3) {
161       const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d*thread_1d),
162                                       1);
163       CeedInt grid = num_elem / elems_per_block +
164                      ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
165       CeedInt shared_mem = elems_per_block*thread_1d*thread_1d*sizeof(
166                              CeedScalar);
167       if (t_mode == CEED_TRANSPOSE) {
168         ierr = CeedRunKernelDimSharedHip(ceed, data->InterpTranspose, grid, thread_1d,
169                                          thread_1d,
170                                          elems_per_block, shared_mem,
171                                          interp_args); CeedChkBackend(ierr);
172       } else {
173         ierr = CeedRunKernelDimSharedHip(ceed, data->Interp, grid, thread_1d, thread_1d,
174                                          elems_per_block, shared_mem,
175                                          interp_args); CeedChkBackend(ierr);
176       }
177     }
178   } break;
179   case CEED_EVAL_GRAD: {
180     CeedInt P_1d, Q_1d;
181     CeedInt block_size = data->block_sizes[1];
182     ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr);
183     ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr);
184     CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
185     CeedScalar *d_grad_1d = data->d_grad_1d;
186     if (data->d_collo_grad_1d) {
187       d_grad_1d = data->d_collo_grad_1d;
188     }
189     void *grad_args[] = {(void *) &num_elem, &data->d_interp_1d,
190                          &d_grad_1d, &d_u, &d_v
191                         };
192     if (dim == 1) {
193       CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
194       elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
195       CeedInt grid = num_elem / elems_per_block +
196                      ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
197       CeedInt shared_mem = elems_per_block*thread_1d*sizeof(CeedScalar);
198       if (t_mode == CEED_TRANSPOSE) {
199         ierr = CeedRunKernelDimSharedHip(ceed, data->GradTranspose, grid, thread_1d, 1,
200                                          elems_per_block, shared_mem, grad_args);
201       } else {
202         ierr = CeedRunKernelDimSharedHip(ceed, data->Grad, grid, thread_1d, 1,
203                                          elems_per_block, shared_mem, grad_args);
204       }
205       CeedChkBackend(ierr);
206     } else if (dim == 2) {
207       // Check if required threads is small enough to do multiple elems
208       const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d*thread_1d),
209                                       1);
210       CeedInt grid = num_elem / elems_per_block +
211                      ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
212       CeedInt shared_mem = elems_per_block*thread_1d*thread_1d*sizeof(
213                              CeedScalar);
214       if (t_mode == CEED_TRANSPOSE) {
215         ierr = CeedRunKernelDimSharedHip(ceed, data->GradTranspose, grid, thread_1d,
216                                          thread_1d,
217                                          elems_per_block, shared_mem,
218                                          grad_args); CeedChkBackend(ierr);
219       } else {
220         ierr = CeedRunKernelDimSharedHip(ceed, data->Grad, grid, thread_1d, thread_1d,
221                                          elems_per_block, shared_mem,
222                                          grad_args); CeedChkBackend(ierr);
223       }
224     } else if (dim == 3) {
225       const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d*thread_1d),
226                                       1);
227       CeedInt grid = num_elem / elems_per_block +
228                      ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
229       CeedInt shared_mem = elems_per_block*thread_1d*thread_1d*sizeof(
230                              CeedScalar);
231       if (t_mode == CEED_TRANSPOSE) {
232         ierr = CeedRunKernelDimSharedHip(ceed, data->GradTranspose, grid, thread_1d,
233                                          thread_1d,
234                                          elems_per_block, shared_mem,
235                                          grad_args); CeedChkBackend(ierr);
236       } else {
237         ierr = CeedRunKernelDimSharedHip(ceed, data->Grad, grid, thread_1d, thread_1d,
238                                          elems_per_block, shared_mem,
239                                          grad_args); CeedChkBackend(ierr);
240       }
241     }
242   } break;
243   case CEED_EVAL_WEIGHT: {
244     CeedInt Q_1d;
245     CeedInt block_size = data->block_sizes[2];
246     ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr);
247     void *weight_args[] = {(void *) &num_elem, (void *) &data->d_q_weight_1d, &d_v};
248     if (dim == 1) {
249       const CeedInt opt_elems = block_size / Q_1d;
250       const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
251       const CeedInt grid_size = num_elem / elems_per_block +
252                                 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
253       ierr = CeedRunKernelDimHip(ceed, data->Weight, grid_size, Q_1d,
254                                  elems_per_block, 1, weight_args);
255       CeedChkBackend(ierr);
256     } else if (dim == 2) {
257       const CeedInt opt_elems = block_size / (Q_1d * Q_1d);
258       const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
259       const CeedInt grid_size = num_elem / elems_per_block +
260                                 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
261       ierr = CeedRunKernelDimHip(ceed, data->Weight, grid_size, Q_1d, Q_1d,
262                                  elems_per_block, weight_args);
263       CeedChkBackend(ierr);
264     } else if (dim == 3) {
265       const CeedInt opt_elems = block_size / (Q_1d * Q_1d);
266       const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
267       const CeedInt grid_size = num_elem / elems_per_block +
268                                 ((num_elem / elems_per_block*elems_per_block < num_elem) ? 1 : 0 );
269       ierr = CeedRunKernelDimHip(ceed, data->Weight, grid_size, Q_1d, Q_1d,
270                                  elems_per_block, weight_args);
271       CeedChkBackend(ierr);
272     }
273   } break;
274   // LCOV_EXCL_START
275   // Evaluate the divergence to/from the quadrature points
276   case CEED_EVAL_DIV:
277     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
278   // Evaluate the curl to/from the quadrature points
279   case CEED_EVAL_CURL:
280     return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
281   // Take no action, BasisApply should not have been called
282   case CEED_EVAL_NONE:
283     return CeedError(ceed, CEED_ERROR_BACKEND,
284                      "CEED_EVAL_NONE does not make sense in this context");
285     // LCOV_EXCL_STOP
286   }
287 
288   // Restore vectors
289   if (eval_mode != CEED_EVAL_WEIGHT) {
290     ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr);
291   }
292   ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr);
293   return CEED_ERROR_SUCCESS;
294 }
295 
296 //------------------------------------------------------------------------------
297 // Destroy basis
298 //------------------------------------------------------------------------------
299 static int CeedBasisDestroy_Hip_shared(CeedBasis basis) {
300   int ierr;
301   Ceed ceed;
302   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
303 
304   CeedBasis_Hip_shared *data;
305   ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr);
306 
307   CeedChk_Hip(ceed, hipModuleUnload(data->module));
308 
309   ierr = hipFree(data->d_q_weight_1d); CeedChk_Hip(ceed, ierr);
310   ierr = hipFree(data->d_interp_1d); CeedChk_Hip(ceed, ierr);
311   ierr = hipFree(data->d_grad_1d); CeedChk_Hip(ceed, ierr);
312   ierr = hipFree(data->d_collo_grad_1d); CeedChk_Hip(ceed, ierr);
313   ierr = CeedFree(&data); CeedChkBackend(ierr);
314 
315   return CEED_ERROR_SUCCESS;
316 }
317 
318 //------------------------------------------------------------------------------
319 // Create tensor basis
320 //------------------------------------------------------------------------------
321 int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d,
322                                        const CeedScalar *interp_1d,
323                                        const CeedScalar *grad_1d,
324                                        const CeedScalar *q_ref1d,
325                                        const CeedScalar *q_weight_1d,
326                                        CeedBasis basis) {
327   int ierr;
328   Ceed ceed;
329   ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr);
330   CeedBasis_Hip_shared *data;
331   ierr = CeedCalloc(1, &data); CeedChkBackend(ierr);
332 
333   // Copy basis data to GPU
334   const CeedInt qBytes = Q_1d * sizeof(CeedScalar);
335   ierr = hipMalloc((void **)&data->d_q_weight_1d, qBytes);
336   CeedChk_Hip(ceed, ierr);
337   ierr = hipMemcpy(data->d_q_weight_1d, q_weight_1d, qBytes,
338                    hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
339 
340   const CeedInt iBytes = qBytes * P_1d;
341   ierr = hipMalloc((void **)&data->d_interp_1d, iBytes); CeedChk_Hip(ceed, ierr);
342   ierr = hipMemcpy(data->d_interp_1d, interp_1d, iBytes,
343                    hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
344 
345   ierr = hipMalloc((void **)&data->d_grad_1d, iBytes); CeedChk_Hip(ceed, ierr);
346   ierr = hipMemcpy(data->d_grad_1d, grad_1d, iBytes,
347                    hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
348 
349   // Compute collocated gradient and copy to GPU
350   data->d_collo_grad_1d = NULL;
351   bool has_collocated_grad = dim == 3 && Q_1d >= P_1d;
352   if (has_collocated_grad) {
353     CeedScalar *collo_grad_1d;
354     ierr = CeedMalloc(Q_1d*Q_1d, &collo_grad_1d); CeedChkBackend(ierr);
355     ierr = CeedBasisGetCollocatedGrad(basis, collo_grad_1d); CeedChkBackend(ierr);
356     ierr = hipMalloc((void **)&data->d_collo_grad_1d, qBytes * Q_1d);
357     CeedChk_Hip(ceed, ierr);
358     ierr = hipMemcpy(data->d_collo_grad_1d, collo_grad_1d, qBytes * Q_1d,
359                      hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr);
360     ierr = CeedFree(&collo_grad_1d); CeedChkBackend(ierr);
361   }
362 
363   // Set number of threads per block for basis kernels
364   CeedInt num_comp;
365   ierr = CeedBasisGetNumComponents(basis, &num_comp); CeedChkBackend(ierr);
366   ierr = ComputeBasisThreadBlockSizes(dim, P_1d, Q_1d, num_comp,
367                                       data->block_sizes);
368   CeedChkBackend(ierr);
369 
370   // Compile basis kernels
371   char *basis_kernel_path, *basis_kernel_source;
372   ierr = CeedGetJitAbsolutePath(ceed,
373                                 "ceed/jit-source/hip/hip-shared-basis-tensor.h",
374                                 &basis_kernel_path); CeedChkBackend(ierr);
375   CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n");
376   ierr = CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source);
377   CeedChkBackend(ierr);
378   CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete! -----\n");
379   ierr = CeedCompileHip(ceed, basis_kernel_source, &data->module, 11,
380                         "BASIS_Q_1D", Q_1d,
381                         "BASIS_P_1D", P_1d,
382                         "T_1D", CeedIntMax(Q_1d, P_1d),
383                         "BASIS_DIM", dim,
384                         "BASIS_NUM_COMP", num_comp,
385                         "BASIS_NUM_NODES", CeedIntPow(P_1d, dim),
386                         "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim),
387                         "BASIS_INTERP_BLOCK_SIZE", data->block_sizes[0],
388                         "BASIS_GRAD_BLOCK_SIZE", data->block_sizes[1],
389                         "BASIS_WEIGHT_BLOCK_SIZE", data->block_sizes[2],
390                         "BASIS_HAS_COLLOCATED_GRAD", has_collocated_grad
391                        ); CeedChkBackend(ierr);
392   ierr = CeedGetKernelHip(ceed, data->module, "Interp", &data->Interp);
393   CeedChkBackend(ierr);
394   ierr = CeedGetKernelHip(ceed, data->module, "InterpTranspose",
395                           &data->InterpTranspose);
396   CeedChkBackend(ierr);
397   ierr = CeedGetKernelHip(ceed, data->module, "Grad", &data->Grad);
398   CeedChkBackend(ierr);
399   ierr = CeedGetKernelHip(ceed, data->module, "GradTranspose",
400                           &data->GradTranspose);
401   CeedChkBackend(ierr);
402   ierr = CeedGetKernelHip(ceed, data->module, "Weight", &data->Weight);
403   CeedChkBackend(ierr);
404   ierr = CeedFree(&basis_kernel_path); CeedChkBackend(ierr);
405   ierr = CeedFree(&basis_kernel_source); CeedChkBackend(ierr);
406 
407   ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr);
408 
409   // Register backend functions
410   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
411                                 CeedBasisApplyTensor_Hip_shared);
412   CeedChkBackend(ierr);
413   ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
414                                 CeedBasisDestroy_Hip_shared); CeedChkBackend(ierr);
415   return CEED_ERROR_SUCCESS;
416 }
417 //------------------------------------------------------------------------------
418