xref: /libCEED/backends/hip-shared/ceed-hip-shared-basis.c (revision 9e511c80210b4447ceabc14bc1a44d55934e0fbf)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <ceed/backend.h>
10 #include <ceed/jit-tools.h>
11 #include <stdbool.h>
12 #include <stddef.h>
13 #include <string.h>
14 #include <hip/hip_runtime.h>
15 
16 #include "../hip/ceed-hip-common.h"
17 #include "../hip/ceed-hip-compile.h"
18 #include "ceed-hip-shared.h"
19 
20 //------------------------------------------------------------------------------
21 // Compute a block size based on required minimum threads
22 //------------------------------------------------------------------------------
23 static CeedInt ComputeBlockSizeFromRequirement(const CeedInt required) {
24   CeedInt maxSize     = 1024;  // Max total threads per block
25   CeedInt currentSize = 64;    // Start with one group
26 
27   while (currentSize < maxSize) {
28     if (currentSize > required) break;
29     else currentSize = currentSize * 2;
30   }
31   return currentSize;
32 }
33 
34 //------------------------------------------------------------------------------
35 // Compute required thread block sizes for basis kernels given P, Q, dim, and
36 // num_comp (num_comp not currently used, but may be again in other basis
37 // parallelization options)
38 //------------------------------------------------------------------------------
39 static int ComputeBasisThreadBlockSizes(const CeedInt dim, const CeedInt P_1d, const CeedInt Q_1d, const CeedInt num_comp, CeedInt *block_sizes) {
40   // Note that this will use the same block sizes for all dimensions when compiling,
41   // but as each basis object is defined for a particular dimension, we will never
42   // call any kernels except the ones for the dimension for which we have computed the
43   // block sizes.
44   const CeedInt thread_1d = CeedIntMax(P_1d, Q_1d);
45 
46   switch (dim) {
47     case 1: {
48       // Interp kernels:
49       block_sizes[0] = 256;
50 
51       // Grad kernels:
52       block_sizes[1] = 256;
53 
54       // Weight kernels:
55       block_sizes[2] = 256;
56     } break;
57     case 2: {
58       // Interp kernels:
59       CeedInt required = thread_1d * thread_1d;
60 
61       block_sizes[0] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required));
62 
63       // Grad kernels: currently use same required minimum threads
64       block_sizes[1] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required));
65 
66       // Weight kernels:
67       required       = CeedIntMax(64, Q_1d * Q_1d);
68       block_sizes[2] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required));
69 
70     } break;
71     case 3: {
72       // Interp kernels:
73       CeedInt required = thread_1d * thread_1d;
74 
75       block_sizes[0] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required));
76 
77       // Grad kernels: currently use same required minimum threads
78       block_sizes[1] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required));
79 
80       // Weight kernels:
81       required       = Q_1d * Q_1d * Q_1d;
82       block_sizes[2] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required));
83     }
84   }
85   return CEED_ERROR_SUCCESS;
86 }
87 
88 //------------------------------------------------------------------------------
89 // Apply basis
90 //------------------------------------------------------------------------------
91 static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode,
92                                                CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
93   Ceed                  ceed;
94   Ceed_Hip             *ceed_Hip;
95   CeedInt               dim, num_comp;
96   const CeedScalar     *d_u;
97   CeedScalar           *d_v;
98   CeedBasis_Hip_shared *data;
99 
100   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
101   CeedCallBackend(CeedGetData(ceed, &ceed_Hip));
102   CeedCallBackend(CeedBasisGetData(basis, &data));
103   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
104   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
105 
106   // Get read/write access to u, v
107   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
108   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
109   if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
110   else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
111 
112   // Apply basis operation
113   switch (eval_mode) {
114     case CEED_EVAL_INTERP: {
115       CeedInt P_1d, Q_1d;
116       CeedInt block_size = data->block_sizes[0];
117 
118       CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
119       CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
120       CeedInt thread_1d     = CeedIntMax(Q_1d, P_1d);
121       void   *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v};
122 
123       if (dim == 1) {
124         CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
125         elems_per_block         = elems_per_block > 0 ? elems_per_block : 1;
126         CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
127         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
128 
129         if (t_mode == CEED_TRANSPOSE) {
130           CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, 1,
131                                                      elems_per_block, shared_mem, interp_args));
132         } else {
133           CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args));
134         }
135       } else if (dim == 2) {
136         // Check if required threads is small enough to do multiple elems
137         const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
138         CeedInt       grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
139         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
140 
141         if (t_mode == CEED_TRANSPOSE) {
142           CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d,
143                                                      elems_per_block, shared_mem, interp_args));
144         } else {
145           CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
146         }
147       } else if (dim == 3) {
148         const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
149         CeedInt       grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
150         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
151 
152         if (t_mode == CEED_TRANSPOSE) {
153           CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d,
154                                                      elems_per_block, shared_mem, interp_args));
155         } else {
156           CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
157         }
158       }
159     } break;
160     case CEED_EVAL_GRAD: {
161       CeedInt P_1d, Q_1d;
162       CeedInt block_size = data->block_sizes[1];
163 
164       CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
165       CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
166       CeedInt     thread_1d = CeedIntMax(Q_1d, P_1d);
167       CeedScalar *d_grad_1d = data->d_grad_1d;
168 
169       if (data->d_collo_grad_1d) {
170         d_grad_1d = data->d_collo_grad_1d;
171       }
172       void *grad_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_grad_1d, &d_u, &d_v};
173       if (dim == 1) {
174         CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
175         elems_per_block         = elems_per_block > 0 ? elems_per_block : 1;
176         CeedInt grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
177         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
178 
179         if (t_mode == CEED_TRANSPOSE) {
180           CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, 1,
181                                                      elems_per_block, shared_mem, grad_args));
182         } else {
183           CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args));
184         }
185       } else if (dim == 2) {
186         // Check if required threads is small enough to do multiple elems
187         const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
188         CeedInt       grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
189         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
190 
191         if (t_mode == CEED_TRANSPOSE) {
192           CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d,
193                                                      elems_per_block, shared_mem, grad_args));
194         } else {
195           CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
196         }
197       } else if (dim == 3) {
198         const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
199         CeedInt       grid            = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
200         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
201 
202         if (t_mode == CEED_TRANSPOSE) {
203           CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d,
204                                                      elems_per_block, shared_mem, grad_args));
205         } else {
206           CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
207         }
208       }
209     } break;
210     case CEED_EVAL_WEIGHT: {
211       CeedInt Q_1d;
212       CeedInt block_size = data->block_sizes[2];
213 
214       CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]);
215       CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
216       void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};
217 
218       if (dim == 1) {
219         const CeedInt opt_elems       = block_size / Q_1d;
220         const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
221         const CeedInt grid_size       = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
222 
223         CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q_1d, elems_per_block, 1, weight_args));
224       } else if (dim == 2) {
225         const CeedInt opt_elems       = block_size / (Q_1d * Q_1d);
226         const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
227         const CeedInt grid_size       = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
228 
229         CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args));
230       } else if (dim == 3) {
231         const CeedInt opt_elems       = block_size / (Q_1d * Q_1d);
232         const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
233         const CeedInt grid_size       = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
234 
235         CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args));
236       }
237     } break;
238     case CEED_EVAL_NONE: /* handled separately below */
239       break;
240     // LCOV_EXCL_START
241     case CEED_EVAL_DIV:
242     case CEED_EVAL_CURL:
243       return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
244       // LCOV_EXCL_STOP
245   }
246 
247   // Restore vectors, cover CEED_EVAL_NONE
248   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
249   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
250   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
251   return CEED_ERROR_SUCCESS;
252 }
253 
254 int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
255                                     CeedVector v) {
256   CeedCallBackend(CeedBasisApplyTensorCore_Hip_shared(basis, false, num_elem, t_mode, eval_mode, u, v));
257   return CEED_ERROR_SUCCESS;
258 }
259 
260 int CeedBasisApplyAddTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
261                                        CeedVector v) {
262   CeedCallBackend(CeedBasisApplyTensorCore_Hip_shared(basis, true, num_elem, t_mode, eval_mode, u, v));
263   return CEED_ERROR_SUCCESS;
264 }
265 
266 //------------------------------------------------------------------------------
267 // Basis apply - tensor AtPoints
268 //------------------------------------------------------------------------------
269 static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points,
270                                                  CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
271   Ceed                  ceed;
272   CeedInt               Q_1d, dim, max_num_points = num_points[0];
273   const CeedInt         is_transpose   = t_mode == CEED_TRANSPOSE;
274   const int             max_block_size = 32;
275   const CeedScalar     *d_x, *d_u;
276   CeedScalar           *d_v;
277   CeedBasis_Hip_shared *data;
278 
279   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
280   CeedCallBackend(CeedBasisGetData(basis, &data));
281   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
282   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
283 
284   // Weight handled separately
285   if (eval_mode == CEED_EVAL_WEIGHT) {
286     CeedCallBackend(CeedVectorSetValue(v, 1.0));
287     return CEED_ERROR_SUCCESS;
288   }
289 
290   // Check padded to uniform number of points per elem
291   for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]);
292   {
293     CeedInt  num_comp, q_comp;
294     CeedSize len, len_required;
295 
296     CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
297     CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
298     CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len));
299     len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points;
300     CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND,
301               "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends."
302               " Found %" CeedSize_FMT ", Required %" CeedSize_FMT,
303               len, len_required);
304   }
305 
306   // Move num_points array to device
307   if (is_transpose) {
308     const CeedInt num_bytes = num_elem * sizeof(CeedInt);
309 
310     if (num_elem != data->num_elem_at_points) {
311       data->num_elem_at_points = num_elem;
312 
313       if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem));
314       CeedCallHip(ceed, hipMalloc((void **)&data->d_points_per_elem, num_bytes));
315       CeedCallBackend(CeedFree(&data->h_points_per_elem));
316       CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem));
317     }
318     if (memcmp(data->h_points_per_elem, num_points, num_bytes)) {
319       memcpy(data->h_points_per_elem, num_points, num_bytes);
320       CeedCallHip(ceed, hipMemcpy(data->d_points_per_elem, num_points, num_bytes, hipMemcpyHostToDevice));
321     }
322   }
323 
324   // Build kernels if needed
325   if (data->num_points != max_num_points) {
326     CeedInt P_1d;
327 
328     CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
329     data->num_points = max_num_points;
330 
331     // -- Create interp matrix to Chebyshev coefficients
332     if (!data->d_chebyshev_interp_1d) {
333       CeedSize    interp_bytes;
334       CeedScalar *chebyshev_interp_1d;
335 
336       interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
337       CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
338       CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
339       CeedCallHip(ceed, hipMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes));
340       CeedCallHip(ceed, hipMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
341       CeedCallBackend(CeedFree(&chebyshev_interp_1d));
342     }
343 
344     // -- Compile kernels
345     char       *basis_kernel_source;
346     const char *basis_kernel_path;
347     CeedInt     num_comp;
348 
349     if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints));
350     CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
351     CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h", &basis_kernel_path));
352     CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
353     CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
354     CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
355     CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
356                                     Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
357                                     "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
358                                     max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1)));
359     CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
360     CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
361     CeedCallBackend(CeedFree(&basis_kernel_path));
362     CeedCallBackend(CeedFree(&basis_kernel_source));
363   }
364 
365   // Get read/write access to u, v
366   CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x));
367   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
368   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
369   if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
370   else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
371 
372   // Clear v for transpose operation
373   if (is_transpose && !apply_add) {
374     CeedSize length;
375 
376     CeedCallBackend(CeedVectorGetLength(v, &length));
377     CeedCallHip(ceed, hipMemset(d_v, 0, length * sizeof(CeedScalar)));
378   }
379 
380   // Basis action
381   switch (eval_mode) {
382     case CEED_EVAL_INTERP: {
383       void *interp_args[]      = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
384       const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
385 
386       CeedCallBackend(CeedRunKernel_Hip(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
387     } break;
388     case CEED_EVAL_GRAD: {
389       void *grad_args[]        = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
390       const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
391 
392       CeedCallBackend(CeedRunKernel_Hip(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
393     } break;
394     case CEED_EVAL_WEIGHT:
395     case CEED_EVAL_NONE: /* handled separately below */
396       break;
397     // LCOV_EXCL_START
398     case CEED_EVAL_DIV:
399     case CEED_EVAL_CURL:
400       return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
401       // LCOV_EXCL_STOP
402   }
403 
404   // Restore vectors, cover CEED_EVAL_NONE
405   CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x));
406   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
407   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
408   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
409   return CEED_ERROR_SUCCESS;
410 }
411 
412 static int CeedBasisApplyAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
413                                              CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
414   CeedCallBackend(CeedBasisApplyAtPointsCore_Hip_shared(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
415   return CEED_ERROR_SUCCESS;
416 }
417 
418 static int CeedBasisApplyAddAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
419                                                 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
420   CeedCallBackend(CeedBasisApplyAtPointsCore_Hip_shared(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
421   return CEED_ERROR_SUCCESS;
422 }
423 
424 //------------------------------------------------------------------------------
425 // Destroy basis
426 //------------------------------------------------------------------------------
427 static int CeedBasisDestroy_Hip_shared(CeedBasis basis) {
428   Ceed                  ceed;
429   CeedBasis_Hip_shared *data;
430 
431   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
432   CeedCallBackend(CeedBasisGetData(basis, &data));
433   CeedCallHip(ceed, hipModuleUnload(data->module));
434   if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints));
435   if (data->d_q_weight_1d) CeedCallHip(ceed, hipFree(data->d_q_weight_1d));
436   CeedCallBackend(CeedFree(&data->h_points_per_elem));
437   if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem));
438   CeedCallHip(ceed, hipFree(data->d_interp_1d));
439   CeedCallHip(ceed, hipFree(data->d_grad_1d));
440   CeedCallHip(ceed, hipFree(data->d_collo_grad_1d));
441   CeedCallHip(ceed, hipFree(data->d_chebyshev_interp_1d));
442   CeedCallBackend(CeedFree(&data));
443   return CEED_ERROR_SUCCESS;
444 }
445 
446 //------------------------------------------------------------------------------
447 // Create tensor basis
448 //------------------------------------------------------------------------------
449 int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
450                                        const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
451   Ceed                  ceed;
452   char                 *basis_kernel_source;
453   const char           *basis_kernel_path;
454   CeedInt               num_comp;
455   const CeedInt         q_bytes      = Q_1d * sizeof(CeedScalar);
456   const CeedInt         interp_bytes = q_bytes * P_1d;
457   CeedBasis_Hip_shared *data;
458 
459   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
460   CeedCallBackend(CeedCalloc(1, &data));
461 
462   // Copy basis data to GPU
463   if (q_weight_1d) {
464     CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, q_bytes));
465     CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, hipMemcpyHostToDevice));
466   }
467   CeedCallHip(ceed, hipMalloc((void **)&data->d_interp_1d, interp_bytes));
468   CeedCallHip(ceed, hipMemcpy(data->d_interp_1d, interp_1d, interp_bytes, hipMemcpyHostToDevice));
469   CeedCallHip(ceed, hipMalloc((void **)&data->d_grad_1d, interp_bytes));
470   CeedCallHip(ceed, hipMemcpy(data->d_grad_1d, grad_1d, interp_bytes, hipMemcpyHostToDevice));
471 
472   // Compute collocated gradient and copy to GPU
473   data->d_collo_grad_1d    = NULL;
474   bool has_collocated_grad = dim == 3 && Q_1d >= P_1d;
475 
476   if (has_collocated_grad) {
477     CeedScalar *collo_grad_1d;
478 
479     CeedCallBackend(CeedMalloc(Q_1d * Q_1d, &collo_grad_1d));
480     CeedCallBackend(CeedBasisGetCollocatedGrad(basis, collo_grad_1d));
481     CeedCallHip(ceed, hipMalloc((void **)&data->d_collo_grad_1d, q_bytes * Q_1d));
482     CeedCallHip(ceed, hipMemcpy(data->d_collo_grad_1d, collo_grad_1d, q_bytes * Q_1d, hipMemcpyHostToDevice));
483     CeedCallBackend(CeedFree(&collo_grad_1d));
484   }
485 
486   // Set number of threads per block for basis kernels
487   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
488   CeedCallBackend(ComputeBasisThreadBlockSizes(dim, P_1d, Q_1d, num_comp, data->block_sizes));
489 
490   // Compile basis kernels
491   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-shared-basis-tensor.h", &basis_kernel_path));
492   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
493   CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
494   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
495   CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 11, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "T_1D",
496                                   CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim),
497                                   "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_INTERP_BLOCK_SIZE", data->block_sizes[0], "BASIS_GRAD_BLOCK_SIZE",
498                                   data->block_sizes[1], "BASIS_WEIGHT_BLOCK_SIZE", data->block_sizes[2], "BASIS_HAS_COLLOCATED_GRAD",
499                                   has_collocated_grad));
500   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp));
501   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
502   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd));
503   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Grad", &data->Grad));
504   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "GradTranspose", &data->GradTranspose));
505   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd));
506   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight));
507   CeedCallBackend(CeedFree(&basis_kernel_path));
508   CeedCallBackend(CeedFree(&basis_kernel_source));
509 
510   CeedCallBackend(CeedBasisSetData(basis, data));
511 
512   // Register backend functions
513   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Hip_shared));
514   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddTensor_Hip_shared));
515   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Hip_shared));
516   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Hip_shared));
517   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip_shared));
518   return CEED_ERROR_SUCCESS;
519 }
520 
521 //------------------------------------------------------------------------------
522