xref: /libCEED/backends/hip-shared/ceed-hip-shared-basis.c (revision 4febb80102c5c87c6a273a85e30d4673d8552be6)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <ceed/backend.h>
10 #include <ceed/jit-tools.h>
11 #include <hip/hip_runtime.h>
12 #include <stdbool.h>
13 #include <stddef.h>
14 
15 #include "../hip/ceed-hip-common.h"
16 #include "../hip/ceed-hip-compile.h"
17 #include "ceed-hip-shared.h"
18 
19 //------------------------------------------------------------------------------
20 // Compute a block size based on required minimum threads
21 //------------------------------------------------------------------------------
22 static CeedInt ComputeBlockSizeFromRequirement(const CeedInt required) {
23   CeedInt maxSize     = 1024;  // Max total threads per block
24   CeedInt currentSize = 64;    // Start with one group
25 
26   while (currentSize < maxSize) {
27     if (currentSize > required) break;
28     else currentSize = currentSize * 2;
29   }
30   return currentSize;
31 }
32 
33 //------------------------------------------------------------------------------
34 // Compute required thread block sizes for basis kernels given P, Q, dim, and
35 // num_comp (num_comp not currently used, but may be again in other basis
36 // parallelization options)
37 //------------------------------------------------------------------------------
38 static int ComputeBasisThreadBlockSizes(const CeedInt dim, const CeedInt P_1d, const CeedInt Q_1d, const CeedInt num_comp, CeedInt *block_sizes) {
39   // Note that this will use the same block sizes for all dimensions when compiling,
40   // but as each basis object is defined for a particular dimension, we will never
41   // call any kernels except the ones for the dimension for which we have computed the
42   // block sizes.
43   const CeedInt thread_1d = CeedIntMax(P_1d, Q_1d);
44   switch (dim) {
45     case 1: {
46       // Interp kernels:
47       block_sizes[0] = 256;
48 
49       // Grad kernels:
50       block_sizes[1] = 256;
51 
52       // Weight kernels:
53       block_sizes[2] = 256;
54     } break;
55     case 2: {
56       // Interp kernels:
57       CeedInt required = thread_1d * thread_1d;
58       block_sizes[0]   = CeedIntMax(256, ComputeBlockSizeFromRequirement(required));
59 
60       // Grad kernels: currently use same required minimum threads
61       block_sizes[1] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required));
62 
63       // Weight kernels:
64       required       = CeedIntMax(64, Q_1d * Q_1d);
65       block_sizes[2] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required));
66 
67     } break;
68     case 3: {
69       // Interp kernels:
70       CeedInt required = thread_1d * thread_1d;
71       block_sizes[0]   = CeedIntMax(256, ComputeBlockSizeFromRequirement(required));
72 
73       // Grad kernels: currently use same required minimum threads
74       block_sizes[1] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required));
75 
76       // Weight kernels:
77       required       = Q_1d * Q_1d * Q_1d;
78       block_sizes[2] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required));
79     }
80   }
81 
82   return CEED_ERROR_SUCCESS;
83 }
84 
85 //------------------------------------------------------------------------------
86 // Apply basis
87 //------------------------------------------------------------------------------
88 int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
89                                     CeedVector v) {
90   Ceed ceed;
91   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
92   Ceed_Hip *ceed_Hip;
93   CeedCallBackend(CeedGetData(ceed, &ceed_Hip));
94   CeedBasis_Hip_shared *data;
95   CeedCallBackend(CeedBasisGetData(basis, &data));
96   CeedInt dim, num_comp;
97   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
98   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
99 
100   // Read vectors
101   const CeedScalar *d_u;
102   CeedScalar       *d_v;
103   if (eval_mode != CEED_EVAL_WEIGHT) {
104     CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
105   }
106   CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
107 
108   // Apply basis operation
109   switch (eval_mode) {
110     case CEED_EVAL_INTERP: {
111       CeedInt P_1d, Q_1d;
112       CeedInt block_size = data->block_sizes[0];
113       CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
114       CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
115       CeedInt thread_1d     = CeedIntMax(Q_1d, P_1d);
116       void   *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v};
117       if (dim == 1) {
118         CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
119         elems_per_block         = elems_per_block > 0 ? elems_per_block : 1;
120         CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
121         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
122         if (t_mode == CEED_TRANSPOSE) {
123           CeedCallBackend(CeedRunKernelDimSharedHip(ceed, data->InterpTranspose, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args));
124         } else {
125           CeedCallBackend(CeedRunKernelDimSharedHip(ceed, data->Interp, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args));
126         }
127       } else if (dim == 2) {
128         // Check if required threads is small enough to do multiple elems
129         const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
130         CeedInt       grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
131         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
132         if (t_mode == CEED_TRANSPOSE) {
133           CeedCallBackend(
134               CeedRunKernelDimSharedHip(ceed, data->InterpTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
135         } else {
136           CeedCallBackend(CeedRunKernelDimSharedHip(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
137         }
138       } else if (dim == 3) {
139         const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
140         CeedInt       grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
141         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
142         if (t_mode == CEED_TRANSPOSE) {
143           CeedCallBackend(
144               CeedRunKernelDimSharedHip(ceed, data->InterpTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
145         } else {
146           CeedCallBackend(CeedRunKernelDimSharedHip(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
147         }
148       }
149     } break;
150     case CEED_EVAL_GRAD: {
151       CeedInt P_1d, Q_1d;
152       CeedInt block_size = data->block_sizes[1];
153       CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
154       CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
155       CeedInt     thread_1d = CeedIntMax(Q_1d, P_1d);
156       CeedScalar *d_grad_1d = data->d_grad_1d;
157       if (data->d_collo_grad_1d) {
158         d_grad_1d = data->d_collo_grad_1d;
159       }
160       void *grad_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_grad_1d, &d_u, &d_v};
161       if (dim == 1) {
162         CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
163         elems_per_block         = elems_per_block > 0 ? elems_per_block : 1;
164         CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
165         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
166         if (t_mode == CEED_TRANSPOSE) {
167           CeedCallBackend(CeedRunKernelDimSharedHip(ceed, data->GradTranspose, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args));
168         } else {
169           CeedCallBackend(CeedRunKernelDimSharedHip(ceed, data->Grad, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args));
170         }
171       } else if (dim == 2) {
172         // Check if required threads is small enough to do multiple elems
173         const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
174         CeedInt       grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
175         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
176         if (t_mode == CEED_TRANSPOSE) {
177           CeedCallBackend(CeedRunKernelDimSharedHip(ceed, data->GradTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
178         } else {
179           CeedCallBackend(CeedRunKernelDimSharedHip(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
180         }
181       } else if (dim == 3) {
182         const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
183         CeedInt       grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
184         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
185         if (t_mode == CEED_TRANSPOSE) {
186           CeedCallBackend(CeedRunKernelDimSharedHip(ceed, data->GradTranspose, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
187         } else {
188           CeedCallBackend(CeedRunKernelDimSharedHip(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
189         }
190       }
191     } break;
192     case CEED_EVAL_WEIGHT: {
193       CeedInt Q_1d;
194       CeedInt block_size = data->block_sizes[2];
195       CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
196       void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};
197       if (dim == 1) {
198         const CeedInt opt_elems       = block_size / Q_1d;
199         const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
200         const CeedInt grid_size       = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
201         CeedCallBackend(CeedRunKernelDimHip(ceed, data->Weight, grid_size, Q_1d, elems_per_block, 1, weight_args));
202       } else if (dim == 2) {
203         const CeedInt opt_elems       = block_size / (Q_1d * Q_1d);
204         const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
205         const CeedInt grid_size       = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
206         CeedCallBackend(CeedRunKernelDimHip(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args));
207       } else if (dim == 3) {
208         const CeedInt opt_elems       = block_size / (Q_1d * Q_1d);
209         const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
210         const CeedInt grid_size       = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
211         CeedCallBackend(CeedRunKernelDimHip(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args));
212       }
213     } break;
214     // LCOV_EXCL_START
215     // Evaluate the divergence to/from the quadrature points
216     case CEED_EVAL_DIV:
217       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported");
218     // Evaluate the curl to/from the quadrature points
219     case CEED_EVAL_CURL:
220       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported");
221     // Take no action, BasisApply should not have been called
222     case CEED_EVAL_NONE:
223       return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_NONE does not make sense in this context");
224       // LCOV_EXCL_STOP
225   }
226 
227   // Restore vectors
228   if (eval_mode != CEED_EVAL_WEIGHT) {
229     CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
230   }
231   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
232   return CEED_ERROR_SUCCESS;
233 }
234 
235 //------------------------------------------------------------------------------
236 // Destroy basis
237 //------------------------------------------------------------------------------
238 static int CeedBasisDestroy_Hip_shared(CeedBasis basis) {
239   Ceed ceed;
240   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
241 
242   CeedBasis_Hip_shared *data;
243   CeedCallBackend(CeedBasisGetData(basis, &data));
244 
245   CeedCallHip(ceed, hipModuleUnload(data->module));
246 
247   CeedCallHip(ceed, hipFree(data->d_q_weight_1d));
248   CeedCallHip(ceed, hipFree(data->d_interp_1d));
249   CeedCallHip(ceed, hipFree(data->d_grad_1d));
250   CeedCallHip(ceed, hipFree(data->d_collo_grad_1d));
251   CeedCallBackend(CeedFree(&data));
252 
253   return CEED_ERROR_SUCCESS;
254 }
255 
256 //------------------------------------------------------------------------------
257 // Create tensor basis
258 //------------------------------------------------------------------------------
259 int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
260                                        const CeedScalar *q_ref1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
261   Ceed ceed;
262   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
263   CeedBasis_Hip_shared *data;
264   CeedCallBackend(CeedCalloc(1, &data));
265 
266   // Copy basis data to GPU
267   const CeedInt qBytes = Q_1d * sizeof(CeedScalar);
268   CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, qBytes));
269   CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight_1d, qBytes, hipMemcpyHostToDevice));
270 
271   const CeedInt iBytes = qBytes * P_1d;
272   CeedCallHip(ceed, hipMalloc((void **)&data->d_interp_1d, iBytes));
273   CeedCallHip(ceed, hipMemcpy(data->d_interp_1d, interp_1d, iBytes, hipMemcpyHostToDevice));
274 
275   CeedCallHip(ceed, hipMalloc((void **)&data->d_grad_1d, iBytes));
276   CeedCallHip(ceed, hipMemcpy(data->d_grad_1d, grad_1d, iBytes, hipMemcpyHostToDevice));
277 
278   // Compute collocated gradient and copy to GPU
279   data->d_collo_grad_1d    = NULL;
280   bool has_collocated_grad = dim == 3 && Q_1d >= P_1d;
281   if (has_collocated_grad) {
282     CeedScalar *collo_grad_1d;
283     CeedCallBackend(CeedMalloc(Q_1d * Q_1d, &collo_grad_1d));
284     CeedCallBackend(CeedBasisGetCollocatedGrad(basis, collo_grad_1d));
285     CeedCallHip(ceed, hipMalloc((void **)&data->d_collo_grad_1d, qBytes * Q_1d));
286     CeedCallHip(ceed, hipMemcpy(data->d_collo_grad_1d, collo_grad_1d, qBytes * Q_1d, hipMemcpyHostToDevice));
287     CeedCallBackend(CeedFree(&collo_grad_1d));
288   }
289 
290   // Set number of threads per block for basis kernels
291   CeedInt num_comp;
292   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
293   CeedCallBackend(ComputeBasisThreadBlockSizes(dim, P_1d, Q_1d, num_comp, data->block_sizes));
294 
295   // Compile basis kernels
296   char *basis_kernel_path, *basis_kernel_source;
297   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-shared-basis-tensor.h", &basis_kernel_path));
298   CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source -----\n");
299   CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
300   CeedDebug256(ceed, 2, "----- Loading Basis Kernel Source Complete! -----\n");
301   CeedCallBackend(CeedCompileHip(ceed, basis_kernel_source, &data->module, 11, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "T_1D", CeedIntMax(Q_1d, P_1d),
302                                  "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS",
303                                  CeedIntPow(Q_1d, dim), "BASIS_INTERP_BLOCK_SIZE", data->block_sizes[0], "BASIS_GRAD_BLOCK_SIZE",
304                                  data->block_sizes[1], "BASIS_WEIGHT_BLOCK_SIZE", data->block_sizes[2], "BASIS_HAS_COLLOCATED_GRAD",
305                                  has_collocated_grad));
306   CeedCallBackend(CeedGetKernelHip(ceed, data->module, "Interp", &data->Interp));
307   CeedCallBackend(CeedGetKernelHip(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
308   CeedCallBackend(CeedGetKernelHip(ceed, data->module, "Grad", &data->Grad));
309   CeedCallBackend(CeedGetKernelHip(ceed, data->module, "GradTranspose", &data->GradTranspose));
310   CeedCallBackend(CeedGetKernelHip(ceed, data->module, "Weight", &data->Weight));
311   CeedCallBackend(CeedFree(&basis_kernel_path));
312   CeedCallBackend(CeedFree(&basis_kernel_source));
313 
314   CeedCallBackend(CeedBasisSetData(basis, data));
315 
316   // Register backend functions
317   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Hip_shared));
318   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip_shared));
319   return CEED_ERROR_SUCCESS;
320 }
321 //------------------------------------------------------------------------------
322