xref: /libCEED/backends/hip-ref/ceed-hip-ref-basis.c (revision 5aed82e4fa97acf4ba24a7f10a35f5303a6798e0)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <ceed/backend.h>
10 #include <ceed/jit-tools.h>
11 #include <hip/hip_runtime.h>
12 
13 #include "../hip/ceed-hip-common.h"
14 #include "../hip/ceed-hip-compile.h"
15 #include "ceed-hip-ref.h"
16 
17 //------------------------------------------------------------------------------
18 // Basis apply - tensor
19 //------------------------------------------------------------------------------
20 int CeedBasisApply_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u, CeedVector v) {
21   Ceed              ceed;
22   CeedInt           Q_1d, dim;
23   const CeedInt     is_transpose   = t_mode == CEED_TRANSPOSE;
24   const int         max_block_size = 64;
25   const CeedScalar *d_u;
26   CeedScalar       *d_v;
27   CeedBasis_Hip    *data;
28 
29   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
30   CeedCallBackend(CeedBasisGetData(basis, &data));
31 
32   // Get read/write access to u, v
33   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
34   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
35   CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
36 
37   // Clear v for transpose operation
38   if (is_transpose) {
39     CeedSize length;
40 
41     CeedCallBackend(CeedVectorGetLength(v, &length));
42     CeedCallHip(ceed, hipMemset(d_v, 0, length * sizeof(CeedScalar)));
43   }
44   CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
45   CeedCallBackend(CeedBasisGetDimension(basis, &dim));
46 
47   // Basis action
48   switch (eval_mode) {
49     case CEED_EVAL_INTERP: {
50       void         *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &d_u, &d_v};
51       const CeedInt block_size    = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
52 
53       CeedCallBackend(CeedRunKernel_Hip(ceed, data->Interp, num_elem, block_size, interp_args));
54     } break;
55     case CEED_EVAL_GRAD: {
56       void         *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_interp_1d, &data->d_grad_1d, &d_u, &d_v};
57       const CeedInt block_size  = max_block_size;
58 
59       CeedCallBackend(CeedRunKernel_Hip(ceed, data->Grad, num_elem, block_size, grad_args));
60     } break;
61     case CEED_EVAL_WEIGHT: {
62       void     *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v};
63       const int block_size_x  = Q_1d;
64       const int block_size_y  = dim >= 2 ? Q_1d : 1;
65 
66       CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, num_elem, block_size_x, block_size_y, 1, weight_args));
67     } break;
68     case CEED_EVAL_NONE: /* handled separately below */
69       break;
70     // LCOV_EXCL_START
71     case CEED_EVAL_DIV:
72     case CEED_EVAL_CURL:
73       return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
74       // LCOV_EXCL_STOP
75   }
76 
77   // Restore vectors, cover CEED_EVAL_NONE
78   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
79   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
80   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
81   return CEED_ERROR_SUCCESS;
82 }
83 
84 //------------------------------------------------------------------------------
85 // Basis apply - non-tensor
86 //------------------------------------------------------------------------------
87 int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt num_elem, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector u,
88                                 CeedVector v) {
89   Ceed                    ceed;
90   CeedInt                 num_nodes, num_qpts;
91   const CeedInt           is_transpose    = t_mode == CEED_TRANSPOSE;
92   const int               elems_per_block = 1;
93   const int               grid            = CeedDivUpInt(num_elem, elems_per_block);
94   const CeedScalar       *d_u;
95   CeedScalar             *d_v;
96   CeedBasisNonTensor_Hip *data;
97 
98   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
99   CeedCallBackend(CeedBasisGetData(basis, &data));
100   CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &num_qpts));
101   CeedCallBackend(CeedBasisGetNumNodes(basis, &num_nodes));
102 
103   // Get read/write access to u, v
104   if (u != CEED_VECTOR_NONE) CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u));
105   else CeedCheck(eval_mode == CEED_EVAL_WEIGHT, ceed, CEED_ERROR_BACKEND, "An input vector is required for this CeedEvalMode");
106   CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v));
107 
108   // Clear v for transpose operation
109   if (is_transpose) {
110     CeedSize length;
111 
112     CeedCallBackend(CeedVectorGetLength(v, &length));
113     CeedCallHip(ceed, hipMemset(d_v, 0, length * sizeof(CeedScalar)));
114   }
115 
116   // Apply basis operation
117   switch (eval_mode) {
118     case CEED_EVAL_INTERP: {
119       void     *interp_args[] = {(void *)&num_elem, &data->d_interp, &d_u, &d_v};
120       const int block_size_x  = is_transpose ? num_nodes : num_qpts;
121 
122       if (is_transpose) {
123         CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->InterpTranspose, grid, block_size_x, 1, elems_per_block, interp_args));
124       } else {
125         CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Interp, grid, block_size_x, 1, elems_per_block, interp_args));
126       }
127     } break;
128     case CEED_EVAL_GRAD: {
129       void     *grad_args[]  = {(void *)&num_elem, &data->d_grad, &d_u, &d_v};
130       const int block_size_x = is_transpose ? num_nodes : num_qpts;
131 
132       if (is_transpose) {
133         CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, grad_args));
134       } else {
135         CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, grad_args));
136       }
137     } break;
138     case CEED_EVAL_DIV: {
139       void     *div_args[]   = {(void *)&num_elem, &data->d_div, &d_u, &d_v};
140       const int block_size_x = is_transpose ? num_nodes : num_qpts;
141 
142       if (is_transpose) {
143         CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, div_args));
144       } else {
145         CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, div_args));
146       }
147     } break;
148     case CEED_EVAL_CURL: {
149       void     *curl_args[]  = {(void *)&num_elem, &data->d_curl, &d_u, &d_v};
150       const int block_size_x = is_transpose ? num_nodes : num_qpts;
151 
152       if (is_transpose) {
153         CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, curl_args));
154       } else {
155         CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, curl_args));
156       }
157     } break;
158     case CEED_EVAL_WEIGHT: {
159       void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight, &d_v};
160 
161       CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid, num_qpts, 1, elems_per_block, weight_args));
162     } break;
163     case CEED_EVAL_NONE: /* handled separately below */
164       break;
165   }
166 
167   // Restore vectors, cover CEED_EVAL_NONE
168   CeedCallBackend(CeedVectorRestoreArray(v, &d_v));
169   if (eval_mode == CEED_EVAL_NONE) CeedCallBackend(CeedVectorSetArray(v, CEED_MEM_DEVICE, CEED_COPY_VALUES, (CeedScalar *)d_u));
170   if (eval_mode != CEED_EVAL_WEIGHT) CeedCallBackend(CeedVectorRestoreArrayRead(u, &d_u));
171   return CEED_ERROR_SUCCESS;
172 }
173 
174 //------------------------------------------------------------------------------
175 // Destroy tensor basis
176 //------------------------------------------------------------------------------
177 static int CeedBasisDestroy_Hip(CeedBasis basis) {
178   Ceed           ceed;
179   CeedBasis_Hip *data;
180 
181   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
182   CeedCallBackend(CeedBasisGetData(basis, &data));
183   CeedCallHip(ceed, hipModuleUnload(data->module));
184   CeedCallHip(ceed, hipFree(data->d_q_weight_1d));
185   CeedCallHip(ceed, hipFree(data->d_interp_1d));
186   CeedCallHip(ceed, hipFree(data->d_grad_1d));
187   CeedCallBackend(CeedFree(&data));
188   return CEED_ERROR_SUCCESS;
189 }
190 
191 //------------------------------------------------------------------------------
192 // Destroy non-tensor basis
193 //------------------------------------------------------------------------------
194 static int CeedBasisDestroyNonTensor_Hip(CeedBasis basis) {
195   Ceed                    ceed;
196   CeedBasisNonTensor_Hip *data;
197 
198   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
199   CeedCallBackend(CeedBasisGetData(basis, &data));
200   CeedCallHip(ceed, hipModuleUnload(data->module));
201   CeedCallHip(ceed, hipFree(data->d_q_weight));
202   CeedCallHip(ceed, hipFree(data->d_interp));
203   CeedCallHip(ceed, hipFree(data->d_grad));
204   CeedCallHip(ceed, hipFree(data->d_div));
205   CeedCallHip(ceed, hipFree(data->d_curl));
206   CeedCallBackend(CeedFree(&data));
207   return CEED_ERROR_SUCCESS;
208 }
209 
210 //------------------------------------------------------------------------------
211 // Create tensor
212 //------------------------------------------------------------------------------
213 int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P_1d, CeedInt Q_1d, const CeedScalar *interp_1d, const CeedScalar *grad_1d,
214                                 const CeedScalar *q_ref_1d, const CeedScalar *q_weight_1d, CeedBasis basis) {
215   Ceed           ceed;
216   char          *basis_kernel_source;
217   const char    *basis_kernel_path;
218   CeedInt        num_comp;
219   const CeedInt  q_bytes      = Q_1d * sizeof(CeedScalar);
220   const CeedInt  interp_bytes = q_bytes * P_1d;
221   CeedBasis_Hip *data;
222 
223   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
224   CeedCallBackend(CeedCalloc(1, &data));
225 
226   // Copy data to GPU
227   CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight_1d, q_bytes));
228   CeedCallHip(ceed, hipMemcpy(data->d_q_weight_1d, q_weight_1d, q_bytes, hipMemcpyHostToDevice));
229   CeedCallHip(ceed, hipMalloc((void **)&data->d_interp_1d, interp_bytes));
230   CeedCallHip(ceed, hipMemcpy(data->d_interp_1d, interp_1d, interp_bytes, hipMemcpyHostToDevice));
231   CeedCallHip(ceed, hipMalloc((void **)&data->d_grad_1d, interp_bytes));
232   CeedCallHip(ceed, hipMemcpy(data->d_grad_1d, grad_1d, interp_bytes, hipMemcpyHostToDevice));
233 
234   // Compile basis kernels
235   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
236   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-tensor.h", &basis_kernel_path));
237   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
238   CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
239   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
240   CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 7, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN",
241                                   num_comp * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp,
242                                   "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim)));
243   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp));
244   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Grad", &data->Grad));
245   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight));
246   CeedCallBackend(CeedFree(&basis_kernel_path));
247   CeedCallBackend(CeedFree(&basis_kernel_source));
248 
249   CeedCallBackend(CeedBasisSetData(basis, data));
250 
251   // Register backend functions
252   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApply_Hip));
253   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroy_Hip));
254   return CEED_ERROR_SUCCESS;
255 }
256 
257 //------------------------------------------------------------------------------
258 // Create non-tensor H^1
259 //------------------------------------------------------------------------------
260 int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *grad,
261                           const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
262   Ceed                    ceed;
263   char                   *basis_kernel_source;
264   const char             *basis_kernel_path;
265   CeedInt                 num_comp, q_comp_interp, q_comp_grad;
266   const CeedInt           q_bytes = num_qpts * sizeof(CeedScalar);
267   CeedBasisNonTensor_Hip *data;
268 
269   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
270   CeedCallBackend(CeedCalloc(1, &data));
271 
272   // Copy basis data to GPU
273   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
274   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad));
275   CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes));
276   CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice));
277   if (interp) {
278     const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;
279 
280     CeedCallHip(ceed, hipMalloc((void **)&data->d_interp, interp_bytes));
281     CeedCallHip(ceed, hipMemcpy(data->d_interp, interp, interp_bytes, hipMemcpyHostToDevice));
282   }
283   if (grad) {
284     const CeedInt grad_bytes = q_bytes * num_nodes * q_comp_grad;
285 
286     CeedCallHip(ceed, hipMalloc((void **)&data->d_grad, grad_bytes));
287     CeedCallHip(ceed, hipMemcpy(data->d_grad, grad, grad_bytes, hipMemcpyHostToDevice));
288   }
289 
290   // Compile basis kernels
291   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
292   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-nontensor.h", &basis_kernel_path));
293   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
294   CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
295   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
296   CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
297                                   q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_grad, "BASIS_NUM_COMP", num_comp));
298   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp));
299   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
300   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Deriv", &data->Deriv));
301   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
302   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight));
303   CeedCallBackend(CeedFree(&basis_kernel_path));
304   CeedCallBackend(CeedFree(&basis_kernel_source));
305 
306   CeedCallBackend(CeedBasisSetData(basis, data));
307 
308   // Register backend functions
309   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip));
310   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip));
311   return CEED_ERROR_SUCCESS;
312 }
313 
314 //------------------------------------------------------------------------------
315 // Create non-tensor H(div)
316 //------------------------------------------------------------------------------
317 int CeedBasisCreateHdiv_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp, const CeedScalar *div,
318                             const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
319   Ceed                    ceed;
320   char                   *basis_kernel_source;
321   const char             *basis_kernel_path;
322   CeedInt                 num_comp, q_comp_interp, q_comp_div;
323   const CeedInt           q_bytes = num_qpts * sizeof(CeedScalar);
324   CeedBasisNonTensor_Hip *data;
325 
326   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
327   CeedCallBackend(CeedCalloc(1, &data));
328 
329   // Copy basis data to GPU
330   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
331   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_DIV, &q_comp_div));
332   CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes));
333   CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice));
334   if (interp) {
335     const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;
336 
337     CeedCallHip(ceed, hipMalloc((void **)&data->d_interp, interp_bytes));
338     CeedCallHip(ceed, hipMemcpy(data->d_interp, interp, interp_bytes, hipMemcpyHostToDevice));
339   }
340   if (div) {
341     const CeedInt div_bytes = q_bytes * num_nodes * q_comp_div;
342 
343     CeedCallHip(ceed, hipMalloc((void **)&data->d_div, div_bytes));
344     CeedCallHip(ceed, hipMemcpy(data->d_div, div, div_bytes, hipMemcpyHostToDevice));
345   }
346 
347   // Compile basis kernels
348   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
349   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-nontensor.h", &basis_kernel_path));
350   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
351   CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
352   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
353   CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
354                                   q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_div, "BASIS_NUM_COMP", num_comp));
355   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp));
356   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
357   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Deriv", &data->Deriv));
358   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
359   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight));
360   CeedCallBackend(CeedFree(&basis_kernel_path));
361   CeedCallBackend(CeedFree(&basis_kernel_source));
362 
363   CeedCallBackend(CeedBasisSetData(basis, data));
364 
365   // Register backend functions
366   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip));
367   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip));
368   return CEED_ERROR_SUCCESS;
369 }
370 
371 //------------------------------------------------------------------------------
372 // Create non-tensor H(curl)
373 //------------------------------------------------------------------------------
374 int CeedBasisCreateHcurl_Hip(CeedElemTopology topo, CeedInt dim, CeedInt num_nodes, CeedInt num_qpts, const CeedScalar *interp,
375                              const CeedScalar *curl, const CeedScalar *q_ref, const CeedScalar *q_weight, CeedBasis basis) {
376   Ceed                    ceed;
377   char                   *basis_kernel_source;
378   const char             *basis_kernel_path;
379   CeedInt                 num_comp, q_comp_interp, q_comp_curl;
380   const CeedInt           q_bytes = num_qpts * sizeof(CeedScalar);
381   CeedBasisNonTensor_Hip *data;
382 
383   CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
384   CeedCallBackend(CeedCalloc(1, &data));
385 
386   // Copy basis data to GPU
387   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
388   CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_CURL, &q_comp_curl));
389   CeedCallHip(ceed, hipMalloc((void **)&data->d_q_weight, q_bytes));
390   CeedCallHip(ceed, hipMemcpy(data->d_q_weight, q_weight, q_bytes, hipMemcpyHostToDevice));
391   if (interp) {
392     const CeedInt interp_bytes = q_bytes * num_nodes * q_comp_interp;
393 
394     CeedCallHip(ceed, hipMalloc((void **)&data->d_interp, interp_bytes));
395     CeedCallHip(ceed, hipMemcpy(data->d_interp, interp, interp_bytes, hipMemcpyHostToDevice));
396   }
397   if (curl) {
398     const CeedInt curl_bytes = q_bytes * num_nodes * q_comp_curl;
399 
400     CeedCallHip(ceed, hipMalloc((void **)&data->d_curl, curl_bytes));
401     CeedCallHip(ceed, hipMemcpy(data->d_curl, curl, curl_bytes, hipMemcpyHostToDevice));
402   }
403 
404   // Compile basis kernels
405   CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp));
406   CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/hip/hip-ref-basis-nontensor.h", &basis_kernel_path));
407   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source -----\n");
408   CeedCallBackend(CeedLoadSourceToBuffer(ceed, basis_kernel_path, &basis_kernel_source));
409   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Basis Kernel Source Complete! -----\n");
410   CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->module, 5, "BASIS_Q", num_qpts, "BASIS_P", num_nodes, "BASIS_Q_COMP_INTERP",
411                                   q_comp_interp, "BASIS_Q_COMP_DERIV", q_comp_curl, "BASIS_NUM_COMP", num_comp));
412   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Interp", &data->Interp));
413   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "InterpTranspose", &data->InterpTranspose));
414   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Deriv", &data->Deriv));
415   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "DerivTranspose", &data->DerivTranspose));
416   CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, "Weight", &data->Weight));
417   CeedCallBackend(CeedFree(&basis_kernel_path));
418   CeedCallBackend(CeedFree(&basis_kernel_source));
419 
420   CeedCallBackend(CeedBasisSetData(basis, data));
421 
422   // Register backend functions
423   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Apply", CeedBasisApplyNonTensor_Hip));
424   CeedCallBackend(CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", CeedBasisDestroyNonTensor_Hip));
425   return CEED_ERROR_SUCCESS;
426 }
427 
428 //------------------------------------------------------------------------------
429