xref: /libCEED/backends/hip-ref/ceed-hip-ref-basis.c (revision 7f85f93f24a11b92d3b3e2f328e571b30ab828a9)
1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <ceed/backend.h>
10 #include <ceed/jit-tools.h>
11 #include <string.h>
12 #include <hip/hip_runtime.h>
13 
14 #include "../hip/ceed-hip-common.h"
15 #include "../hip/ceed-hip-compile.h"
16 #include "ceed-hip-ref.h"
17 
18 //------------------------------------------------------------------------------
19 // Basis apply - tensor
20 //------------------------------------------------------------------------------
21 static int CeedBasisApplyCore_Hip(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
22                                   CeedVector u, CeedVector v) {
23   Ceed              ceed;
24   CeedInt           Q_1d, dim;
25   const CeedInt     is_transpose   = t_mode == CEED_TRANSPOSE;
26   const int         max_block_size = 64;
27   const CeedScalar *d_u;
28   CeedScalar       *d_v;
29   CeedBasis_Hip    *data;
30 
31   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
32   CeedCallBackend(CeedBasisGetData(basis, &data));
33 
34   // Get read/write access to u, v
35   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
36   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
37   if (apply_add) {
38     CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
39   } else {
40     // Clear v for transpose operation
41     if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0));
42     CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
43   }
44 
45   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
46   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
47 
48   // Basis action
49   switch (eval_mode) {
50     case CEED_EVAL_INTERP: {
51       void         *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &d_u, &d_v};
52       const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
53 
54       CeedCallBackend(CeedRunKernel_Hip(ceed, data->Interp, num_elem, block_size, interp_args));
55     } break;
56     case CEED_EVAL_GRAD: {
57       void         *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &data->d_grad_1d, &d_u, &d_v};
58       const CeedInt block_size  = max_block_size;
59 
60       CeedCallBackend(CeedRunKernel_Hip(ceed, data->Grad, num_elem, block_size, grad_args));
61     } break;
62     case CEED_EVAL_WEIGHT: {
63       CeedCheck(data->d_q_weight_1d, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights_1d not set", CeedEvalModes[eval_mode]);
64       void     *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};
65       const int block_size_x  = Q_1d;
66       const int block_size_y  = dim >= 2 ? Q_1d : 1;
67 
68       CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, num_elem, block_size_x, block_size_y, 1, weight_args));
69     } break;
70     case CEED_EVAL_NONE: /* handled separately below */
71       break;
72     // LCOV_EXCL_START
73     case CEED_EVAL_DIV:
74     case CEED_EVAL_CURL:
75       return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
76       // LCOV_EXCL_STOP
77   }
78 
79   // Restore vectors, cover CEED_EVAL_NONE
80   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
81   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
82   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
83   CeedCallBackend(CeedDestroy(&ceed));
84   return CEED_ERROR_SUCCESS;
85 }
86 
87 static int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
88   CeedCallBackend(CeedBasisApplyCore_Hip(basis, false, num_elem, t_mode, eval_mode, u, v));
89   return CEED_ERROR_SUCCESS;
90 }
91 
92 static int CeedBasisApplyAdd_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
93                                  CeedVector v) {
94   CeedCallBackend(CeedBasisApplyCore_Hip(basis, true, num_elem, t_mode, eval_mode, u, v));
95   return CEED_ERROR_SUCCESS;
96 }
97 
98 //------------------------------------------------------------------------------
99 // Basis apply - tensor AtPoints
100 //------------------------------------------------------------------------------
101 static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points,
102                                           CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
103   Ceed              ceed;
104   CeedInt           Q_1d, dim, max_num_points = num_points[0];
105   const CeedInt     is_transpose   = t_mode == CEED_TRANSPOSE;
106   const int         max_block_size = 32;
107   const CeedScalar *d_x, *d_u;
108   CeedScalar       *d_v;
109   CeedBasis_Hip    *data;
110 
111   CeedCallBackend(CeedBasisGetData(basis, &data));
112   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
113   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
114 
115   // Weight handled separately
116   if (eval_mode == CEED_EVAL_WEIGHT) {
117     CeedCallBackend(CeedVectorSetValue(v, 1.0));
118     return CEED_ERROR_SUCCESS;
119   }
120 
121   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
122 
123   // Check padded to uniform number of points per elem
124   for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]);
125   {
126     CeedInt  num_comp, q_comp;
127     CeedSize len, len_required;
128 
129     CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
130     CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp));
131     CeedCallBackend(CeedVectorGetLength(is_transpose ? u : v, &len));
132     len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points;
133     CeedCheck(len >= len_required, ceed, CEED_ERROR_BACKEND,
134               "Vector at points must be padded to the same number of points in each element for BasisApplyAtPoints on GPU backends."
135               " Found %" CeedSize_FMT ", Required %" CeedSize_FMT,
136               len, len_required);
137   }
138 
139   // Move num_points array to device
140   if (is_transpose) {
141     const CeedInt num_bytes = num_elem * sizeof(CeedInt);
142 
143     if (num_elem != data->num_elem_at_points) {
144       data->num_elem_at_points = num_elem;
145 
146       if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem));
147       CeedCallHip(ceed, hipMalloc((void **)&data->d_points_per_elem, num_bytes));
148       CeedCallBackend(CeedFree(&data->h_points_per_elem));
149       CeedCallBackend(CeedCalloc(num_elem, &data->h_points_per_elem));
150     }
151     if (memcmp(data->h_points_per_elem, num_points, num_bytes)) {
152       memcpy(data->h_points_per_elem, num_points, num_bytes);
153       CeedCallHip(ceed, hipMemcpy(data->d_points_per_elem, num_points, num_bytes, hipMemcpyHostToDevice));
154     }
155   }
156 
157   // Build kernels if needed
158   if (data->num_points != max_num_points) {
159     CeedInt P_1d;
160 
161     CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
162     data->num_points = max_num_points;
163 
164     // -- Create interp matrix to Chebyshev coefficients
165     if (!data->d_chebyshev_interp_1d) {
166       CeedSize    interp_bytes;
167       CeedScalar *chebyshev_interp_1d;
168 
169       interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
170       CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
171       CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
172       CeedCallHip(ceed, hipMalloc((void **)&data->d_chebyshev_interp_1d, interp_bytes));
173       CeedCallHip(ceed, hipMemcpy(data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
174       CeedCallBackend(CeedFree(&chebyshev_interp_1d));
175     }
176 
177     // -- Compile kernels
178     const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h>\n";
179     CeedInt    num_comp;
180 
181     if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints));
182     CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
183     CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
184                                     Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
185                                     "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
186                                     max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1)));
187     CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
188     CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints));
189     CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
190     CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints));
191   }
192 
193   // Get read/write access to u, v
194   CeedCallBackend(CeedVectorGetArrayRead(x_ref, CEED_MEM_DEVICE, &d_x));
195   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
196   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
197   if (apply_add) {
198     CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
199   } else {
200     // Clear v for transpose operation
201     if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0));
202     CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
203   }
204 
205   // Basis action
206   switch (eval_mode) {
207     case CEED_EVAL_INTERP: {
208       void         *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
209       const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
210 
211       CeedCallBackend(CeedRunKernel_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size,
212                                         interp_args));
213     } break;
214     case CEED_EVAL_GRAD: {
215       void         *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
216       const CeedInt block_size  = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
217 
218       CeedCallBackend(CeedRunKernel_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, num_elem, block_size, grad_args));
219     } break;
220     case CEED_EVAL_WEIGHT:
221     case CEED_EVAL_NONE: /* handled separately below */
222       break;
223     // LCOV_EXCL_START
224     case CEED_EVAL_DIV:
225     case CEED_EVAL_CURL:
226       return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
227       // LCOV_EXCL_STOP
228   }
229 
230   // Restore vectors, cover CEED_EVAL_NONE
231   CeedCallBackend(CeedVectorRestoreArrayRead(x_ref, &d_x));
232   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
233   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
234   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
235   CeedCallBackend(CeedDestroy(&ceed));
236   return CEED_ERROR_SUCCESS;
237 }
238 
239 static int CeedBasisApplyAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
240                                       CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
241   CeedCallBackend(CeedBasisApplyAtPointsCore_Hip(basis, false, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
242   return CEED_ERROR_SUCCESS;
243 }
244 
245 static int CeedBasisApplyAddAtPoints_Hip(CeedBasis basis, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode,
246                                          CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) {
247   CeedCallBackend(CeedBasisApplyAtPointsCore_Hip(basis, true, num_elem, num_points, t_mode, eval_mode, x_ref, u, v));
248   return CEED_ERROR_SUCCESS;
249 }
250 
251 //------------------------------------------------------------------------------
252 // Basis apply - non-tensor
253 //------------------------------------------------------------------------------
254 static int CeedBasisApplyNonTensorCore_Hip(CeedBasis basis, bool apply_add, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode,
255                                            CeedVector u, CeedVector v) {
256   Ceed                    ceed;
257   CeedInt                 num_nodes, num_qpts;
258   const CeedInt           is_transpose    = t_mode == CEED_TRANSPOSE;
259   const int               elems_per_block = 1;
260   const int               grid            = CeedDivUpInt(num_elem, elems_per_block);
261   const CeedScalar       *d_u;
262   CeedScalar             *d_v;
263   CeedBasisNonTensor_Hip *data;
264 
265   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
266   CeedCallBackend(CeedBasisGetData(basis, &data));
267   CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts));
268   CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes));
269 
270   // Get read/write access to u, v
271   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
272   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
273   if (apply_add) {
274     CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v));
275   } else {
276     // Clear v for transpose operation
277     if (is_transpose) CeedCallBackend(CeedVectorSetValue(v, 0.0));
278     CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
279   }
280 
281   // Apply basis operation
282   switch (eval_mode) {
283     case CEED_EVAL_INTERP: {
284       void     *interp_args[] = {(void *)&num_elem, &data->d_interp, &d_u, &d_v};
285       const int block_size_x  = is_transpose ? num_nodes : num_qpts;
286 
287       if (is_transpose) {
288         CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->InterpTranspose, grid, block_size_x, 1, elems_per_block, interp_args));
289       } else {
290         CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Interp, grid, block_size_x, 1, elems_per_block, interp_args));
291       }
292     } break;
293     case CEED_EVAL_GRAD: {
294       void     *grad_args[]  = {(void *)&num_elem, &data->d_grad, &d_u, &d_v};
295       const int block_size_x = is_transpose ? num_nodes : num_qpts;
296 
297       if (is_transpose) {
298         CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, grad_args));
299       } else {
300         CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, grad_args));
301       }
302     } break;
303     case CEED_EVAL_DIV: {
304       void     *div_args[]   = {(void *)&num_elem, &data->d_div, &d_u, &d_v};
305       const int block_size_x = is_transpose ? num_nodes : num_qpts;
306 
307       if (is_transpose) {
308         CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, div_args));
309       } else {
310         CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, div_args));
311       }
312     } break;
313     case CEED_EVAL_CURL: {
314       void     *curl_args[]  = {(void *)&num_elem, &data->d_curl, &d_u, &d_v};
315       const int block_size_x = is_transpose ? num_nodes : num_qpts;
316 
317       if (is_transpose) {
318         CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, curl_args));
319       } else {
320         CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, curl_args));
321       }
322     } break;
323     case CEED_EVAL_WEIGHT: {
324       CeedCheck(data->d_q_weight, ceed, CEED_ERROR_BACKEND, "%s not supported; q_weights not set", CeedEvalModes[eval_mode]);
325       void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v};
326 
327       CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args));
328     } break;
329     case CEED_EVAL_NONE: /* handled separately below */
330       break;
331   }
332 
333   // Restore vectors, cover CEED_EVAL_NONE
334   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
335   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
336   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
337   CeedCallBackend(CeedDestroy(&ceed));
338   return CEED_ERROR_SUCCESS;
339 }
340 
341 static int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
342                                        CeedVector v) {
343   CeedCallBackend(CeedBasisApplyNonTensorCore_Hip(basis, false, num_elem, t_mode, eval_mode, u, v));
344   return CEED_ERROR_SUCCESS;
345 }
346 
347 static int CeedBasisApplyAddNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
348                                           CeedVector v) {
349   CeedCallBackend(CeedBasisApplyNonTensorCore_Hip(basis, true, num_elem, t_mode, eval_mode, u, v));
350   return CEED_ERROR_SUCCESS;
351 }
352 
353 //------------------------------------------------------------------------------
354 // Destroy tensor basis
355 //------------------------------------------------------------------------------
356 static int CeedBasisDestroy_Hip(CeedBasis basis) {
357   Ceed           ceed;
358   CeedBasis_Hip *data;
359 
360   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
361   CeedCallBackend(CeedBasisGetData(basis, &data));
362   CeedCallHip(ceed, hipModuleUnload(data->module));
363   if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints));
364   if (data->d_q_weight_1d) CeedCallHip(ceed, hipFree(data->d_q_weight_1d));
365   CeedCallBackend(CeedFree(&data->h_points_per_elem));
366   if (data->d_points_per_elem) CeedCallHip(ceed, hipFree(data->d_points_per_elem));
367   CeedCallHip(ceed, hipFree(data->d_interp_1d));
368   CeedCallHip(ceed, hipFree(data->d_grad_1d));
369   CeedCallHip(ceed, hipFree(data->d_chebyshev_interp_1d));
370   CeedCallBackend(CeedFree(&data));
371   CeedCallBackend(CeedDestroy(&ceed));
372   return CEED_ERROR_SUCCESS;
373 }
374 
375 //------------------------------------------------------------------------------
376 // Destroy non-tensor basis
377 //------------------------------------------------------------------------------
378 static int CeedBasisDestroyNonTensor_Hip(CeedBasis basis) {
379   Ceed                    ceed;
380   CeedBasisNonTensor_Hip *data;
381 
382   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
383   CeedCallBackend(CeedBasisGetData(basis, &data));
384   CeedCallHip(ceed, hipModuleUnload(data->module));
385   if (data->d_q_weight) CeedCallHip(ceed, hipFree(data->d_q_weight));
386   CeedCallHip(ceed, hipFree(data->d_interp));
387   CeedCallHip(ceed, hipFree(data->d_grad));
388   CeedCallHip(ceed, hipFree(data->d_div));
389   CeedCallHip(ceed, hipFree(data->d_curl));
390   CeedCallBackend(CeedFree(&data));
391   CeedCallBackend(CeedDestroy(&ceed));
392   return CEED_ERROR_SUCCESS;
393 }
394 
395 //------------------------------------------------------------------------------
396 // Create tensor
397 //------------------------------------------------------------------------------
398 int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
399                                 const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
400   Ceed           ceed;
401   CeedInt        num_comp;
402   const CeedInt  q_bytes      = Q_1d * sizeof(CeedScalar);
403   const CeedInt  interp_bytes = q_bytes * P_1d;
404   CeedBasis_Hip *data;
405 
406   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
407   CeedCallBackend(CeedCalloc(1, &data));
408 
409   // Copy data to GPU
410   if (q_weight_1d) {
411     CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, q_bytes));
412     CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, hipMemcpyHostToDevice));
413   }
414   CeedCallHip(ceed, hipMalloc((void **)&data->d_interp_1d, interp_bytes));
415   CeedCallHip(ceed, hipMemcpy(data->d_interp_1d, interp_1d, interp_bytes, hipMemcpyHostToDevice));
416   CeedCallHip(ceed, hipMalloc((void **)&data->d_grad_1d, interp_bytes));
417   CeedCallHip(ceed, hipMemcpy(data->d_grad_1d, grad_1d, interp_bytes, hipMemcpyHostToDevice));
418 
419   // Compile basis kernels
420   const char basis_kernel_source[] = "// Tensor basis source\n#include <ceed/jit-source/hip/hip-ref-basis-tensor.h>\n";
421 
422   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
423   CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
424                                   Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
425                                   "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim)));
426   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp));
427   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Grad", &data->Grad));
428   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight));
429 
430   CeedCallBackend(CeedBasisSetData(basis, data));
431 
432   // Register backend functions
433   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Hip));
434   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAdd_Hip));
435   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAtPoints", CeedBasisApplyAtPoints_Hip));
436   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAddAtPoints", CeedBasisApplyAddAtPoints_Hip));
437   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip));
438   CeedCallBackend(CeedDestroy(&ceed));
439   return CEED_ERROR_SUCCESS;
440 }
441 
442 //------------------------------------------------------------------------------
443 // Create non-tensor H^1
444 //------------------------------------------------------------------------------
445 int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad,
446                           const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
447   Ceed                    ceed;
448   CeedInt                 num_comp, q_comp_interp, q_comp_grad;
449   const CeedInt           q_bytes = num_qpts * sizeof(CeedScalar);
450   CeedBasisNonTensor_Hip *data;
451 
452   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
453   CeedCallBackend(CeedCalloc(1, &data));
454 
455   // Copy basis data to GPU
456   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
457   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad));
458   if (q_weight) {
459     CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes));
460     CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice));
461   }
462   if (interp) {
463     const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;
464 
465     CeedCallHip(ceed, hipMalloc((void **)&data->d_interp, interp_bytes));
466     CeedCallHip(ceed, hipMemcpy(data->d_interp, interp, interp_bytes, hipMemcpyHostToDevice));
467   }
468   if (grad) {
469     const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad;
470 
471     CeedCallHip(ceed, hipMalloc((void **)&data->d_grad, grad_bytes));
472     CeedCallHip(ceed, hipMemcpy(data->d_grad, grad, grad_bytes, hipMemcpyHostToDevice));
473   }
474 
475   // Compile basis kernels
476   const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/hip/hip-ref-basis-nontensor.h>\n";
477 
478   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
479   CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
480                                   q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_grad, "BASIS_NUM_COMP", num_comp));
481   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp));
482   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
483   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Deriv", &data->Deriv));
484   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
485   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight));
486 
487   CeedCallBackend(CeedBasisSetData(basis, data));
488 
489   // Register backend functions
490   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip));
491   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip));
492   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip));
493   CeedCallBackend(CeedDestroy(&ceed));
494   return CEED_ERROR_SUCCESS;
495 }
496 
497 //------------------------------------------------------------------------------
498 // Create non-tensor H(div)
499 //------------------------------------------------------------------------------
500 int CeedBasisCreateHdiv_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div,
501                             const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
502   Ceed                    ceed;
503   CeedInt                 num_comp, q_comp_interp, q_comp_div;
504   const CeedInt           q_bytes = num_qpts * sizeof(CeedScalar);
505   CeedBasisNonTensor_Hip *data;
506 
507   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
508   CeedCallBackend(CeedCalloc(1, &data));
509 
510   // Copy basis data to GPU
511   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
512   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div));
513   if (q_weight) {
514     CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes));
515     CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice));
516   }
517   if (interp) {
518     const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;
519 
520     CeedCallHip(ceed, hipMalloc((void **)&data->d_interp, interp_bytes));
521     CeedCallHip(ceed, hipMemcpy(data->d_interp, interp, interp_bytes, hipMemcpyHostToDevice));
522   }
523   if (div) {
524     const CeedInt div_bytes = q_bytes * num_nodes * q_comp_div;
525 
526     CeedCallHip(ceed, hipMalloc((void **)&data->d_div, div_bytes));
527     CeedCallHip(ceed, hipMemcpy(data->d_div, div, div_bytes, hipMemcpyHostToDevice));
528   }
529 
530   // Compile basis kernels
531   const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/hip/hip-ref-basis-nontensor.h>\n";
532 
533   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
534   CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
535                                   q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_div, "BASIS_NUM_COMP", num_comp));
536   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp));
537   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
538   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Deriv", &data->Deriv));
539   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
540   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight));
541 
542   CeedCallBackend(CeedBasisSetData(basis, data));
543 
544   // Register backend functions
545   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip));
546   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip));
547   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip));
548   CeedCallBackend(CeedDestroy(&ceed));
549   return CEED_ERROR_SUCCESS;
550 }
551 
552 //------------------------------------------------------------------------------
553 // Create non-tensor H(curl)
554 //------------------------------------------------------------------------------
555 int CeedBasisCreateHcurl_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
556                              const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
557   Ceed                    ceed;
558   CeedInt                 num_comp, q_comp_interp, q_comp_curl;
559   const CeedInt           q_bytes = num_qpts * sizeof(CeedScalar);
560   CeedBasisNonTensor_Hip *data;
561 
562   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
563   CeedCallBackend(CeedCalloc(1, &data));
564 
565   // Copy basis data to GPU
566   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
567   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl));
568   if (q_weight) {
569     CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes));
570     CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice));
571   }
572   if (interp) {
573     const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;
574 
575     CeedCallHip(ceed, hipMalloc((void **)&data->d_interp, interp_bytes));
576     CeedCallHip(ceed, hipMemcpy(data->d_interp, interp, interp_bytes, hipMemcpyHostToDevice));
577   }
578   if (curl) {
579     const CeedInt curl_bytes = q_bytes * num_nodes * q_comp_curl;
580 
581     CeedCallHip(ceed, hipMalloc((void **)&data->d_curl, curl_bytes));
582     CeedCallHip(ceed, hipMemcpy(data->d_curl, curl, curl_bytes, hipMemcpyHostToDevice));
583   }
584 
585   // Compile basis kernels
586   const char basis_kernel_source[] = "// Nontensor basis source\n#include <ceed/jit-source/hip/hip-ref-basis-nontensor.h>\n";
587 
588   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
589   CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
590                                   q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_curl, "BASIS_NUM_COMP", num_comp));
591   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp));
592   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
593   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Deriv", &data->Deriv));
594   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
595   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight));
596 
597   CeedCallBackend(CeedBasisSetData(basis, data));
598 
599   // Register backend functions
600   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip));
601   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "ApplyAdd", CeedBasisApplyAddNonTensor_Hip));
602   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip));
603   CeedCallBackend(CeedDestroy(&ceed));
604   return CEED_ERROR_SUCCESS;
605 }
606 
607 //------------------------------------------------------------------------------
608