xref: /libCEED/backends/hip-shared/ceed-hip-shared-basis.c (revision dc007f05648c670dfdc3e42fab8d6c1219c0afbb)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <ceed/backend.h>
10 #include <ceed/jit-tools.h>
11 #include <stdbool.h>
12 #include <stddef.h>
13 #include <string.h>
14 #include <hip/hip_runtime.h>
15 
16 #include "../hip/ceed-hip-common.h"
17 #include "../hip/ceed-hip-compile.h"
18 #include "ceed-hip-shared.h"
19 
20 //------------------------------------------------------------------------------
21 // Compute a block size based on required minimum threads
22 //------------------------------------------------------------------------------
23 static CeedInt ComputeBlockSizeFromRequirement(const CeedInt required) {
24   CeedInt maxSize     = 1024;  // Max total threads per block
25   CeedInt currentSize = 64;    // Start with one group
26 
27   while (currentSize < maxSize) {
28     if (currentSize > required) break;
29     else currentSize = currentSize * 2;
30   }
31   return currentSize;
32 }
33 
34 //------------------------------------------------------------------------------
35 // Compute required thread block sizes for basis kernels given P, Q, dim, and
36 // num_comp (num_comp not currently used, but may be again in other basis
37 // parallelization options)
38 //------------------------------------------------------------------------------
39 static int ComputeBasisThreadBlockSizes(const CeedInt dim, const CeedInt P_1d, const CeedInt Q_1d, const CeedInt num_comp, CeedInt *block_sizes) {
40   // Note that this will use the same block sizes for all dimensions when compiling,
41   // but as each basis object is defined for a particular dimension, we will never
42   // call any kernels except the ones for the dimension for which we have computed the
43   // block sizes.
44   const CeedInt thread_1d = CeedIntMax(P_1d, Q_1d);
45 
46   switch (dim) {
47     case 1: {
48       // Interp kernels:
49       block_sizes[0] = 256;
50 
51       // Grad kernels:
52       block_sizes[1] = 256;
53 
54       // Weight kernels:
55       block_sizes[2] = 256;
56     } break;
57     case 2: {
58       // Interp kernels:
59       CeedInt required = thread_1d * thread_1d;
60 
61       block_sizes[0] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required));
62 
63       // Grad kernels: currently use same required minimum threads
64       block_sizes[1] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required));
65 
66       // Weight kernels:
67       required       = CeedIntMax(64, Q_1d * Q_1d);
68       block_sizes[2] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required));
69 
70     } break;
71     case 3: {
72       // Interp kernels:
73       CeedInt required = thread_1d * thread_1d;
74 
75       block_sizes[0] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required));
76 
77       // Grad kernels: currently use same required minimum threads
78       block_sizes[1] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required));
79 
80       // Weight kernels:
81       required       = Q_1d * Q_1d * Q_1d;
82       block_sizes[2] = CeedIntMax(256, ComputeBlockSizeFromRequirement(required));
83     }
84   }
85   return CEED_ERROR_SUCCESS;
86 }
87 
//------------------------------------------------------------------------------
// Apply basis
//
// Shared core for CeedBasisApplyTensor_Hip_shared (apply_add = false,
// overwrites v) and CeedBasisApplyAddTensor_Hip_shared (apply_add = true,
// sums into v). Launches the precompiled HIP kernels held on the
// CeedBasis_Hip_shared data for the requested eval_mode; thread-block sizes
// come from data->block_sizes computed at setup.
//------------------------------------------------------------------------------
static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode,
                                               CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
  Ceed                  ceed;
  Ceed_Hip             *ceed_Hip;
  CeedInt               dim, num_comp;
  const CeedScalar     *d_u;
  CeedScalar           *d_v;
  CeedBasis_Hip_shared *data;

  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
  CeedCallBackend(CeedGetData(ceed, &ceed_Hip));
  CeedCallBackend(CeedBasisGetData(basis, &data));
  CeedCallBackend(CeedBasisGetDimension(basis, &dim));
  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));

  // Get read/write access to u, v
  // u may legitimately be CEED_VECTOR_NONE only for CEED_EVAL_WEIGHT;
  // apply_add needs read-write access to v, plain apply write-only
  if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
  else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));

  // Apply basis operation
  switch (eval_mode) {
    case CEED_EVAL_INTERP: {
      CeedInt P_1d, Q_1d;
      CeedInt block_size = data->block_sizes[0];

      CeedCheck(data->d_interp_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; interp_1d not set", CeedEvalModes[eval_mode]);
      CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
      CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
      // Kernels are launched with max(P_1d, Q_1d) threads per 1D direction
      CeedInt thread_1d     = CeedIntMax(Q_1d, P_1d);
      void   *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v};

      if (dim == 1) {
        // Pack up to 64 elements per block, capped so total threads <= 256
        CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
        elems_per_block         = elems_per_block > 0 ? elems_per_block : 1;
        // grid = ceil(num_elem / elems_per_block)
        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
        // One scalar of shared memory per thread in the block
        CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);

        if (t_mode == CEED_TRANSPOSE) {
          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, 1,
                                                     elems_per_block, shared_mem, interp_args));
        } else {
          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, grid, thread_1d, 1, elems_per_block, shared_mem, interp_args));
        }
      } else if (dim == 2) {
        // Check if required threads is small enough to do multiple elems
        const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
        CeedInt       grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
        CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);

        if (t_mode == CEED_TRANSPOSE) {
          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d,
                                                     elems_per_block, shared_mem, interp_args));
        } else {
          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
        }
      } else if (dim == 3) {
        // 3D kernels use a 2D thread layout (thread_1d x thread_1d) per element
        const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
        CeedInt       grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
        CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);

        if (t_mode == CEED_TRANSPOSE) {
          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread_1d, thread_1d,
                                                     elems_per_block, shared_mem, interp_args));
        } else {
          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, grid, thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));
        }
      }
    } break;
    case CEED_EVAL_GRAD: {
      CeedInt P_1d, Q_1d;
      CeedInt block_size = data->block_sizes[1];

      CeedCheck(data->d_grad_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; grad_1d not set", CeedEvalModes[eval_mode]);
      CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
      CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
      CeedInt     thread_1d = CeedIntMax(Q_1d, P_1d);
      CeedScalar *d_grad_1d = data->d_grad_1d;

      // Prefer the collocated gradient matrix when one was built at setup
      if (data->d_collo_grad_1d) {
        d_grad_1d = data->d_collo_grad_1d;
      }
      void *grad_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_grad_1d, &d_u, &d_v};

      if (dim == 1) {
        // Same 1D packing heuristic as the interp case above
        CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
        elems_per_block         = elems_per_block > 0 ? elems_per_block : 1;
        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
        CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);

        if (t_mode == CEED_TRANSPOSE) {
          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, 1,
                                                     elems_per_block, shared_mem, grad_args));
        } else {
          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, grid, thread_1d, 1, elems_per_block, shared_mem, grad_args));
        }
      } else if (dim == 2) {
        // Check if required threads is small enough to do multiple elems
        const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
        CeedInt       grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
        CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);

        if (t_mode == CEED_TRANSPOSE) {
          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d,
                                                     elems_per_block, shared_mem, grad_args));
        } else {
          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
        }
      } else if (dim == 3) {
        const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
        CeedInt       grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
        CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);

        if (t_mode == CEED_TRANSPOSE) {
          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread_1d, thread_1d,
                                                     elems_per_block, shared_mem, grad_args));
        } else {
          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, grid, thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));
        }
      }
    } break;
    case CEED_EVAL_WEIGHT: {
      CeedInt Q_1d;
      CeedInt block_size = data->block_sizes[2];

      CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]);
      CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
      void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};

      // Weight kernels need no shared memory; fit as many elements per block
      // as block_size allows (at least one)
      if (dim == 1) {
        const CeedInt opt_elems       = block_size / Q_1d;
        const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
        const CeedInt grid_size       = num_elem / elems_per_block + (num_elem % elems_per_block > 0);

        CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q_1d, elems_per_block, 1, weight_args));
      } else if (dim == 2) {
        const CeedInt opt_elems       = block_size / (Q_1d * Q_1d);
        const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
        const CeedInt grid_size       = num_elem / elems_per_block + (num_elem % elems_per_block > 0);

        CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args));
      } else if (dim == 3) {
        const CeedInt opt_elems       = block_size / (Q_1d * Q_1d);
        const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;
        const CeedInt grid_size       = num_elem / elems_per_block + (num_elem % elems_per_block > 0);

        CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args));
      }
    } break;
    case CEED_EVAL_NONE: /* handled separately below */
      break;
    // LCOV_EXCL_START
    case CEED_EVAL_DIV:
    case CEED_EVAL_CURL:
      return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
      // LCOV_EXCL_STOP
  }

  // Restore vectors, cover CEED_EVAL_NONE
  // CEED_EVAL_NONE copies the input array straight into v after restore
  CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
  if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
  if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
  // Release the reference obtained from CeedBasisGetCeed
  CeedCallBackend(CeedDestroy(&ceed));
  return CEED_ERROR_SUCCESS;
}
257 
258 int CeedBasisApplyTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
259                                     CeedVector v) {
260   CeedCallBackend(CeedBasisApplyTensorCore_Hip_shared(basis, false, num_elem, t_mode, eval_mode, u, v));
261   return CEED_ERROR_SUCCESS;
262 }
263 
264 int CeedBasisApplyAddTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
265                                        CeedVector v) {
266   CeedCallBackend(CeedBasisApplyTensorCore_Hip_shared(basis, true, num_elem, t_mode, eval_mode, u, v));
267   return CEED_ERROR_SUCCESS;
268 }
269 
270 //------------------------------------------------------------------------------
271 // Basis apply - tensor AtPoints
272 //------------------------------------------------------------------------------
273 static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points,
274                                                  CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
275   Ceed                  ceed;
276   CeedInt               Q_1d, dim, max_num_points = num_points[0];
277   const CeedInt         is_transpose = t_mode == CEED_TRANSPOSE;
278   const CeedScalar     *d_x, *d_u;
279   CeedScalar           *d_v;
280   CeedBasis_Hip_shared *data;
281 
282   CeedCallBackend(CeedBasisGetData(basis, &data));
283   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
284   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
285 
286   // Weight handled separately
287   if (eval_mode == CEED_EVAL_WEIGHT) {
288     CeedCallBackend(CeedVectorSetValue(v, 1.0));
289     return CEED_ERROR_SUCCESS;
290   }
291 
292   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
293 
294   // Check padded to uniform number of points per elem
295   for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]);
296   {
297     CeedInt  num_comp, q_comp;
298     CeedSize len, len_required;
299 
300     CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
301     CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
302     CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len));
303     len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points;
304     CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND,
305               "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends."
306               " Found %" CeedSize_FMT ", Required %" CeedSize_FMT,
307               len, len_required);
308   }
309 
310   // Move num_points array to device
311   if (is_transpose) {
312     const CeedInt num_bytes = num_elem * sizeof(CeedInt);
313 
314     if (num_elem != data->num_elem_at_points) {
315       data->num_elem_at_points = num_elem;
316 
317       if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem));
318       CeedCallHip(ceed, hipMalloc((void **)&data->d_points_per_elem, num_bytes));
319       CeedCallBackend(CeedFree(&data->h_points_per_elem));
320       CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem));
321     }
322     if (memcmp(data->h_points_per_elem, num_points, num_bytes)) {
323       memcpy(data->h_points_per_elem, num_points, num_bytes);
324       CeedCallHip(ceed, hipMemcpy(data->d_points_per_elem, num_points, num_bytes, hipMemcpyHostToDevice));
325     }
326   }
327 
328   // Build kernels if needed
329   if (data->num_points != max_num_points) {
330     CeedInt P_1d;
331 
332     CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
333     data->num_points = max_num_points;
334 
335     // -- Create interp matrix to Chebyshev coefficients
336     if (!data->d_chebyshev_interp_1d) {
337       CeedSize    interp_bytes;
338       CeedScalar *chebyshev_interp_1d;
339 
340       interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
341       CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
342       CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
343       CeedCallHip(ceed, hipMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes));
344       CeedCallHip(ceed, hipMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
345       CeedCallBackend(CeedFree(&chebyshev_interp_1d));
346     }
347 
348     // -- Compile kernels
349     const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h>\n";
350     CeedInt    num_comp;
351 
352     if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints));
353     CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
354     CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "T_1D",
355                                     CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim),
356                                     "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", max_num_points, "BASIS_INTERP_BLOCK_SIZE",
357                                     data->block_sizes[0]));
358     CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
359     CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints));
360     CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
361     CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints));
362   }
363 
364   // Get read/write access to u, v
365   CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x));
366   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
367   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
368   if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
369   else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
370 
371   // Clear v for transpose operation
372   if (is_transpose && !apply_add) {
373     CeedInt  num_comp, q_comp, num_nodes;
374     CeedSize length;
375 
376     CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
377     CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
378     CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes));
379     length =
380         (CeedSize)num_elem * (CeedSize)num_comp * (t_mode == CEED_TRANSPOSE ? (CeedSize)num_nodes : ((CeedSize)max_num_points * (CeedSize)q_comp));
381     CeedCallHip(ceed, hipMemset(d_v, 0, length * sizeof(CeedScalar)));
382   }
383 
384   // Basis action
385   switch (eval_mode) {
386     case CEED_EVAL_INTERP: {
387       CeedInt P_1d, Q_1d;
388       CeedInt block_size = data->block_sizes[0];
389 
390       CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
391       CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
392       CeedInt thread_1d     = CeedIntMax(Q_1d, P_1d);
393       void   *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
394 
395       if (dim == 1) {
396         CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
397         elems_per_block         = elems_per_block > 0 ? elems_per_block : 1;
398         CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
399         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
400 
401         CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, 1,
402                                                    elems_per_block, shared_mem, interp_args));
403       } else if (dim == 2) {
404         // Check if required threads is small enough to do multiple elems
405         const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
406         CeedInt       grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
407         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
408 
409         CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d,
410                                                    thread_1d, elems_per_block, shared_mem, interp_args));
411       } else if (dim == 3) {
412         const CeedInt elems_per_block = 1;
413         CeedInt       grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
414         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
415 
416         CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d,
417                                                    thread_1d, elems_per_block, shared_mem, interp_args));
418       }
419     } break;
420     case CEED_EVAL_GRAD: {
421       CeedInt P_1d, Q_1d;
422       CeedInt block_size = data->block_sizes[0];
423 
424       CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
425       CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
426       CeedInt thread_1d   = CeedIntMax(Q_1d, P_1d);
427       void   *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
428 
429       if (dim == 1) {
430         CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
431         elems_per_block         = elems_per_block > 0 ? elems_per_block : 1;
432         CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
433         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);
434 
435         CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, 1,
436                                                    elems_per_block, shared_mem, grad_args));
437       } else if (dim == 2) {
438         // Check if required threads is small enough to do multiple elems
439         const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1);
440         CeedInt       grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
441         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
442 
443         CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d,
444                                                    elems_per_block, shared_mem, grad_args));
445       } else if (dim == 3) {
446         const CeedInt elems_per_block = 1;
447         CeedInt       grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
448         CeedInt       shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);
449 
450         CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d,
451                                                    elems_per_block, shared_mem, grad_args));
452       }
453     } break;
454     case CEED_EVAL_WEIGHT:
455     case CEED_EVAL_NONE: /* handled separately below */
456       break;
457     // LCOV_EXCL_START
458     case CEED_EVAL_DIV:
459     case CEED_EVAL_CURL:
460       return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
461       // LCOV_EXCL_STOP
462   }
463 
464   // Restore vectors, cover CEED_EVAL_NONE
465   CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x));
466   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
467   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
468   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
469   return CEED_ERROR_SUCCESS;
470 }
471 
472 static int CeedBasisApplyAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
473                                              CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
474   CeedCallBackend(CeedBasisApplyAtPointsCore_Hip_shared(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
475   return CEED_ERROR_SUCCESS;
476 }
477 
478 static int CeedBasisApplyAddAtPoints_Hip_shared(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
479                                                 CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
480   CeedCallBackend(CeedBasisApplyAtPointsCore_Hip_shared(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
481   return CEED_ERROR_SUCCESS;
482 }
483 
//------------------------------------------------------------------------------
// Apply basis
//
// Shared core for the non-tensor apply entry points (apply_add selects
// overwrite vs. accumulate on v). All kernels here use a flat thread count of
// max(P, Q) per element with a 1D packing heuristic.
// NOTE(review): the data fields reuse the tensor names (d_interp_1d,
// d_grad_1d, d_q_weight_1d) — presumably holding full non-tensor matrices;
// confirm against basis setup.
//------------------------------------------------------------------------------
static int CeedBasisApplyNonTensorCore_Hip_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode,
                                                  CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
  Ceed                  ceed;
  Ceed_Hip             *ceed_Hip;
  CeedInt               dim, num_comp;
  const CeedScalar     *d_u;
  CeedScalar           *d_v;
  CeedBasis_Hip_shared *data;

  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
  CeedCallBackend(CeedGetData(ceed, &ceed_Hip));
  CeedCallBackend(CeedBasisGetData(basis, &data));
  CeedCallBackend(CeedBasisGetDimension(basis, &dim));
  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));

  // Get read/write access to u, v
  // u may legitimately be CEED_VECTOR_NONE only for CEED_EVAL_WEIGHT
  if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
  else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
  if (apply_add) CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
  else CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));

  // Apply basis operation
  switch (eval_mode) {
    case CEED_EVAL_INTERP: {
      CeedInt P, Q;

      CeedCheck(data->d_interp_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; interp not set", CeedEvalModes[eval_mode]);
      CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
      // One thread per node or quadrature point, whichever is larger
      CeedInt thread        = CeedIntMax(Q, P);
      void   *interp_args[] = {(void *)&num_elem, &data->d_interp_1d, &d_u, &d_v};

      {
        // Pack up to 64 elements per block, capped so total threads <= 256
        CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64;
        elems_per_block         = elems_per_block > 0 ? elems_per_block : 1;
        // grid = ceil(num_elem / elems_per_block)
        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
        CeedInt shared_mem      = elems_per_block * thread * sizeof(CeedScalar);

        if (t_mode == CEED_TRANSPOSE) {
          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, grid, thread, 1,
                                                     elems_per_block, shared_mem, interp_args));
        } else {
          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, grid, thread, 1, elems_per_block, shared_mem, interp_args));
        }
      }
    } break;
    case CEED_EVAL_GRAD: {
      CeedInt P, Q;

      CeedCheck(data->d_grad_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; grad not set", CeedEvalModes[eval_mode]);
      CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
      CeedInt thread      = CeedIntMax(Q, P);
      void   *grad_args[] = {(void *)&num_elem, &data->d_grad_1d, &d_u, &d_v};

      {
        // Same packing heuristic as the interp case above
        CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64;
        elems_per_block         = elems_per_block > 0 ? elems_per_block : 1;
        CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);
        CeedInt shared_mem      = elems_per_block * thread * sizeof(CeedScalar);

        if (t_mode == CEED_TRANSPOSE) {
          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, grid, thread, 1, elems_per_block,
                                                     shared_mem, grad_args));
        } else {
          CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, grid, thread, 1, elems_per_block, shared_mem, grad_args));
        }
      }
    } break;
    case CEED_EVAL_WEIGHT: {
      CeedInt P, Q;

      CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]);
      CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
      CeedInt thread        = CeedIntMax(Q, P);
      void   *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};

      {
        // Weight kernel needs no shared memory; same packing heuristic
        CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64;
        elems_per_block         = elems_per_block > 0 ? elems_per_block : 1;
        const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0);

        CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, thread, elems_per_block, 1, weight_args));
      }
    } break;
    case CEED_EVAL_NONE: /* handled separately below */
      break;
    // LCOV_EXCL_START
    case CEED_EVAL_DIV:
    case CEED_EVAL_CURL:
      return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
      // LCOV_EXCL_STOP
  }

  // Restore vectors, cover CEED_EVAL_NONE
  // CEED_EVAL_NONE copies the input array straight into v after restore
  CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
  if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
  if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
  // Release the reference obtained from CeedBasisGetCeed
  CeedCallBackend(CeedDestroy(&ceed));
  return CEED_ERROR_SUCCESS;
}
589 
590 int CeedBasisApplyNonTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
591                                        CeedVector v) {
592   CeedCallBackend(CeedBasisApplyNonTensorCore_Hip_shared(basis, false, num_elem, t_mode, eval_mode, u, v));
593   return CEED_ERROR_SUCCESS;
594 }
595 
596 int CeedBasisApplyAddNonTensor_Hip_shared(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
597                                           CeedVector v) {
598   CeedCallBackend(CeedBasisApplyNonTensorCore_Hip_shared(basis, true, num_elem, t_mode, eval_mode, u, v));
599   return CEED_ERROR_SUCCESS;
600 }
601 
602 //------------------------------------------------------------------------------
603 // Destroy basis
604 //------------------------------------------------------------------------------
605 static int CeedBasisDestroy_Hip_shared(CeedBasis basis) {
606   Ceed                  ceed;
607   CeedBasis_Hip_shared *data;
608 
609   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
610   CeedCallBackend(CeedBasisGetData(basis, &data));
611   CeedCallHip(ceed, hipModuleUnload(data->module));
612   if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints));
613   if (data->d_q_weight_1d) CeedCallHip(ceed, hipFree(data->d_q_weight_1d));
614   CeedCallBackend(CeedFree(&data->h_points_per_elem));
615   if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem));
616   CeedCallHip(ceed, hipFree(data->d_interp_1d));
617   CeedCallHip(ceed, hipFree(data->d_grad_1d));
618   CeedCallHip(ceed, hipFree(data->d_collo_grad_1d));
619   CeedCallHip(ceed, hipFree(data->d_chebyshev_interp_1d));
620   CeedCallBackend(CeedFree(&data));
621   return CEED_ERROR_SUCCESS;
622 }
623 
624 //------------------------------------------------------------------------------
625 // Create tensor basis
626 //------------------------------------------------------------------------------
int CeedBasisCreateTensorH1_Hip_shared(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
                                       const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
  Ceed                  ceed;
  CeedInt               num_comp;
  // Byte sizes of one 1D quadrature-point array and of a P_1d x Q_1d interp/grad matrix
  const CeedInt         q_bytes      = Q_1d * sizeof(CeedScalar);
  const CeedInt         interp_bytes = q_bytes * P_1d;
  CeedBasis_Hip_shared *data;

  // Note: q_ref_1d is unused by this constructor (nothing here copies it to the device)
  CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
  CeedCallBackend(CeedCalloc(1, &data));

  // Copy basis data to GPU; q_weight_1d is optional and may be NULL
  if (q_weight_1d) {
    CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, q_bytes));
    CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, hipMemcpyHostToDevice));
  }
  CeedCallHip(ceed, hipMalloc((void **)&data->d_interp_1d, interp_bytes));
  CeedCallHip(ceed, hipMemcpy(data->d_interp_1d, interp_1d, interp_bytes, hipMemcpyHostToDevice));
  CeedCallHip(ceed, hipMalloc((void **)&data->d_grad_1d, interp_bytes));
  CeedCallHip(ceed, hipMemcpy(data->d_grad_1d, grad_1d, interp_bytes, hipMemcpyHostToDevice));

  // Compute collocated gradient and copy to GPU; only built for 3D bases with Q_1d >= P_1d
  data->d_collo_grad_1d    = NULL;
  bool has_collocated_grad = dim == 3 && Q_1d >= P_1d;

  if (has_collocated_grad) {
    CeedScalar *collo_grad_1d;

    // Host staging buffer for the Q_1d x Q_1d collocated gradient matrix
    CeedCallBackend(CeedMalloc(Q_1d * Q_1d, &collo_grad_1d));
    CeedCallBackend(CeedBasisGetCollocatedGrad(basis, collo_grad_1d));
    CeedCallHip(ceed, hipMalloc((void **)&data->d_collo_grad_1d, q_bytes * Q_1d));
    CeedCallHip(ceed, hipMemcpy(data->d_collo_grad_1d, collo_grad_1d, q_bytes * Q_1d, hipMemcpyHostToDevice));
    CeedCallBackend(CeedFree(&collo_grad_1d));
  }

  // Set number of threads per block for basis kernels
  CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
  CeedCallBackend(ComputeBasisThreadBlockSizes(dim, P_1d, Q_1d, num_comp, data->block_sizes));

  // Compile basis kernels
  const char basis_kernel_source[] = "// Tensor basis source\n#include <ceed/jit-source/hip/hip-shared-basis-tensor.h>\n";

  // JIT compile with 11 name/value definition pairs fixing sizes, block sizes, and
  // whether the collocated gradient path is available in the kernel source
  CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 11, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "T_1D",
                                  CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim),
                                  "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_INTERP_BLOCK_SIZE", data->block_sizes[0], "BASIS_GRAD_BLOCK_SIZE",
                                  data->block_sizes[1], "BASIS_WEIGHT_BLOCK_SIZE", data->block_sizes[2], "BASIS_HAS_COLLOCATED_GRAD",
                                  has_collocated_grad));
  // Cache the kernel handles used by the apply functions
  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp));
  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd));
  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Grad", &data->Grad));
  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "GradTranspose", &data->GradTranspose));
  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd));
  CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight));

  CeedCallBackend(CeedBasisSetData(basis, data));

  // Register backend functions
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyTensor_Hip_shared));
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddTensor_Hip_shared));
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Hip_shared));
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Hip_shared));
  CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip_shared));
  // Release the reference taken by CeedBasisGetCeed
  CeedCallBackend(CeedDestroy(&ceed));
  return CEED_ERROR_SUCCESS;
}
693 
694 //------------------------------------------------------------------------------
695 // Create non-tensor basis
696 //------------------------------------------------------------------------------
697 int CeedBasisCreateH1_Hip_shared(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
698                                  const CeedScalar *grad, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
699   Ceed                  ceed;
700   CeedInt               num_comp, q_comp_interp, q_comp_grad;
701   const CeedInt         q_bytes = num_qpts * sizeof(CeedScalar);
702   CeedBasis_Hip_shared *data;
703 
704   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
705 
706   // Check shared memory size
707   {
708     Ceed_Hip *hip_data;
709 
710     CeedCallBackend(CeedGetData(ceed, &hip_data));
711     if (((size_t)num_nodes * (size_t)num_qpts * (size_t)dim + (size_t)CeedIntMax(num_nodes, num_qpts)) * sizeof(CeedScalar) >
712         hip_data->device_prop.sharedMemPerBlock) {
713       CeedCallBackend(CeedBasisCreateH1Fallback(ceed, topo, dim, num_nodes, num_qpts, interp, grad, q_ref, q_weight, basis));
714       return CEED_ERROR_SUCCESS;
715     }
716   }
717 
718   CeedCallBackend(CeedCalloc(1, &data));
719 
720   // Copy basis data to GPU
721   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
722   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad));
723   if (q_weight) {
724     CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, q_bytes));
725     CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight, q_bytes, hipMemcpyHostToDevice));
726   }
727   if (interp) {
728     const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;
729 
730     CeedCallHip(ceed, hipMalloc((void **)&data->d_interp_1d, interp_bytes));
731     CeedCallHip(ceed, hipMemcpy(data->d_interp_1d, interp, interp_bytes, hipMemcpyHostToDevice));
732   }
733   if (grad) {
734     const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad;
735 
736     CeedCallHip(ceed, hipMalloc((void **)&data->d_grad_1d, grad_bytes));
737     CeedCallHip(ceed, hipMemcpy(data->d_grad_1d, grad, grad_bytes, hipMemcpyHostToDevice));
738   }
739 
740   // Compile basis kernels
741   const char basis_kernel_source[] = "// Non-tensor basis source\n#include <ceed/jit-source/hip/hip-shared-basis-nontensor.h>\n";
742 
743   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
744   CeedCallBackend(ComputeBasisThreadBlockSizes(dim, num_nodes, num_qpts, num_comp, data->block_sizes));
745   CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 6, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "T_1D",
746                                   CeedIntMax(num_qpts, num_nodes), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_INTERP_BLOCK_SIZE",
747                                   data->block_sizes[0]));
748   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp));
749   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
750   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTransposeAdd", &data->InterpTransposeAdd));
751   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Grad", &data->Grad));
752   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "GradTranspose", &data->GradTranspose));
753   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "GradTransposeAdd", &data->GradTransposeAdd));
754   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight));
755 
756   CeedCallBackend(CeedBasisSetData(basis, data));
757 
758   // Register backend functions
759   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip_shared));
760   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip_shared));
761   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip_shared));
762   CeedCallBackend(CeedDestroy(&ceed));
763   return CEED_ERROR_SUCCESS;
764 }
765 
766 //------------------------------------------------------------------------------
767