xref: /libCEED/backends/hip-gen/ceed-hip-gen-operator-build.cpp (revision 99421279ebf997f2fb157e1e18f23b9dc112bbac)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #define CEED_DEBUG_COLOR 12
9 
10 #include <ceed.h>
11 #include <ceed/backend.h>
12 #include <ceed/jit-tools.h>
13 
14 #include <iostream>
15 #include <sstream>
16 #include <string>
17 
18 #include "../hip-ref/ceed-hip-ref.h"
19 #include "../hip-shared/ceed-hip-shared.h"
20 #include "../hip/ceed-hip-common.h"
21 #include "../hip/ceed-hip-compile.h"
22 #include "ceed-hip-gen.h"
23 
24 struct FieldReuse_Hip {
25   CeedInt      index;
26   bool         is_input;
27   CeedEvalMode eval_mode;
28 };
29 
30 //------------------------------------------------------------------------------
31 // Calculate the block size used for launching the operator kernel
32 //------------------------------------------------------------------------------
33 extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt num_elem, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes) {
34   const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
35   if (dim == 1) {
36     CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
37 
38     elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
39     block_sizes[0]  = thread_1d;
40     block_sizes[1]  = 1;
41     block_sizes[2]  = elems_per_block;
42   } else if (dim == 2) {
43     const CeedInt elems_per_block = thread_1d < 4 ? 16 : 2;
44 
45     block_sizes[0] = thread_1d;
46     block_sizes[1] = thread_1d;
47     block_sizes[2] = elems_per_block;
48   } else if (dim == 3) {
49     const CeedInt elems_per_block = thread_1d < 6 ? 4 : (thread_1d < 8 ? 2 : 1);
50 
51     block_sizes[0] = thread_1d;
52     block_sizes[1] = thread_1d;
53     block_sizes[2] = elems_per_block;
54   }
55   return CEED_ERROR_SUCCESS;
56 }
57 
58 //------------------------------------------------------------------------------
59 // Determine type of operator
60 //------------------------------------------------------------------------------
61 static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
62                                                CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields,
63                                                CeedQFunctionField *qf_output_fields, CeedInt *max_P_1d, CeedInt *Q_1d, CeedInt *dim, bool *is_tensor,
64                                                bool *use_3d_slices) {
65   // Find dim, P_1d, Q_1d
66   *max_P_1d  = 0;
67   *Q_1d      = 0;
68   *dim       = 0;
69   *is_tensor = true;
70 
71   for (CeedInt i = 0; i < num_input_fields; i++) {
72     CeedBasis basis;
73 
74     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
75     if (basis != CEED_BASIS_NONE) {
76       bool    is_field_tensor;
77       CeedInt field_P_1d = 0, field_Q_1d = 0, field_dim = 0;
78 
79       // Collect dim, P_1d, and Q_1d
80       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
81       *is_tensor = *is_tensor && is_field_tensor;
82       if (is_field_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
83       else CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P_1d));
84       *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
85       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
86       CeedCheck(*dim == 0 || field_dim == *dim, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
87       *dim = field_dim;
88       if (is_field_tensor) CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
89       else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q_1d));
90       CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
91       *Q_1d = field_Q_1d;
92     }
93     CeedCallBackend(CeedBasisDestroy(&basis));
94   }
95   for (CeedInt i = 0; i < num_output_fields; i++) {
96     CeedBasis basis;
97 
98     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
99     if (basis != CEED_BASIS_NONE) {
100       bool    is_field_tensor;
101       CeedInt field_P_1d = 0, field_Q_1d = 0, field_dim = 0;
102 
103       // Collect dim, P_1d, and Q_1d
104       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
105       *is_tensor = *is_tensor && is_field_tensor;
106       if (is_field_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
107       else CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P_1d));
108       *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
109       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
110       CeedCheck(*dim == 0 || field_dim == *dim, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
111       *dim = field_dim;
112       if (is_field_tensor) CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
113       else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q_1d));
114       CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
115       *Q_1d = field_Q_1d;
116     }
117     CeedCallBackend(CeedBasisDestroy(&basis));
118   }
119 
120   // Only use 3D collocated gradient parallelization strategy when gradient is computed
121   *use_3d_slices = false;
122   if (*dim == 3) {
123     bool was_grad_found = false;
124 
125     for (CeedInt i = 0; i < num_input_fields; i++) {
126       CeedEvalMode eval_mode;
127 
128       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
129       if (eval_mode == CEED_EVAL_GRAD) {
130         CeedBasis_Hip_shared *basis_data;
131         CeedBasis             basis;
132 
133         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
134         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
135         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
136         was_grad_found = true;
137         CeedCallBackend(CeedBasisDestroy(&basis));
138       }
139     }
140     for (CeedInt i = 0; i < num_output_fields; i++) {
141       CeedEvalMode eval_mode;
142 
143       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
144       if (eval_mode == CEED_EVAL_GRAD) {
145         CeedBasis_Hip_shared *basis_data;
146         CeedBasis             basis;
147 
148         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
149         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
150         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
151         was_grad_found = true;
152         CeedCallBackend(CeedBasisDestroy(&basis));
153       }
154     }
155   }
156   return CEED_ERROR_SUCCESS;
157 }
158 
159 //------------------------------------------------------------------------------
160 // Setup fields
161 //------------------------------------------------------------------------------
162 static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedOperatorField op_field,
163                                                     CeedQFunctionField qf_field, FieldReuse_Hip field_reuse, CeedInt Q_1d, bool is_input,
164                                                     bool is_tensor, bool is_at_points, bool use_3d_slices) {
165   const char           *field_name;
166   std::string           var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
167   std::string           P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
168   std::string           option_name = (is_input ? "inputs" : "outputs");
169   CeedEvalMode          eval_mode   = CEED_EVAL_NONE;
170   CeedInt               elem_size = 0, num_comp = 0, P_1d = 0;
171   CeedElemRestriction   elem_rstr;
172   CeedBasis_Hip_shared *basis_data;
173   CeedBasis             basis;
174 
175   // Field reuse info
176   bool use_previous_field = field_reuse.index != -1;
177 
178   CeedCallBackend(CeedOperatorFieldGetName(op_field, &field_name));
179   code << "  // -- " << (is_input ? "Input" : "Output") << " field " << i << ": " << field_name << "\n";
180 
181   // Get field data
182   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
183   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
184     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
185     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
186   }
187   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
188   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
189   if (basis != CEED_BASIS_NONE) {
190     CeedCallBackend(CeedBasisGetData(basis, &basis_data));
191     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
192     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
193   }
194   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
195 
196   // Set field constants
197   if (eval_mode != CEED_EVAL_WEIGHT) {
198     code << "  const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
199     code << "  const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
200   }
201 
202   // Load basis data
203   code << "  // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
204   switch (eval_mode) {
205     case CEED_EVAL_NONE:
206       break;
207     case CEED_EVAL_INTERP:
208       if (is_at_points) {
209         // AtPoints
210         if (!basis_data->d_chebyshev_interp_1d) {
211           CeedSize    interp_bytes;
212           CeedScalar *chebyshev_interp_1d;
213 
214           interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
215           CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
216           CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
217           CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
218           CeedCallHip(CeedBasisReturnCeed(basis),
219                       hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
220           CeedCallBackend(CeedFree(&chebyshev_interp_1d));
221         }
222         if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
223         else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
224       } else {
225         // Standard quadrature
226         if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
227         else data->B.outputs[i] = basis_data->d_interp_1d;
228       }
229       if (use_previous_field) {
230         std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
231 
232         code << "  CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
233       } else {
234         code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
235         code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
236       }
237       break;
238     case CEED_EVAL_GRAD:
239       if (is_at_points) {
240         // AtPoints
241         if (!basis_data->d_chebyshev_interp_1d) {
242           CeedSize    interp_bytes;
243           CeedScalar *chebyshev_interp_1d;
244 
245           interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
246           CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
247           CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
248           CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
249           CeedCallHip(CeedBasisReturnCeed(basis),
250                       hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
251           CeedCallBackend(CeedFree(&chebyshev_interp_1d));
252         }
253         if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
254         else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
255       } else {
256         // Standard quadrature
257         if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
258         else data->B.outputs[i] = basis_data->d_interp_1d;
259       }
260       if (is_tensor) {
261         if (use_previous_field) {
262           std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
263 
264           code << "  CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
265         } else {
266           code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
267           code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
268         }
269       }
270       if (is_at_points) break;  // No G mat for AtPoints
271       if (use_3d_slices) {
272         if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
273         else data->G.outputs[i] = basis_data->d_collo_grad_1d;
274         if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
275           std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
276 
277           code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
278         } else {
279           code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
280           code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
281         }
282       } else {
283         bool has_collo_grad = basis_data->d_collo_grad_1d;
284 
285         if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
286         else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
287         if (has_collo_grad) {
288           if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
289             std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
290 
291             code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
292           } else {
293             code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
294             code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
295           }
296         } else {
297           if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
298             std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
299 
300             code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
301           } else {
302             code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim") << "];\n";
303             code << "  LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << ">(data, G." << option_name << "[" << i << "], s_G"
304                  << var_suffix << ");\n";
305           }
306         }
307       }
308       break;
309     case CEED_EVAL_WEIGHT:
310       break;  // No action
311       // LCOV_EXCL_START
312     case CEED_EVAL_DIV:
313     case CEED_EVAL_CURL:
314       break;  // TODO: Not implemented
315               // LCOV_EXCL_STOP
316   }
317   CeedCallBackend(CeedBasisDestroy(&basis));
318   return CEED_ERROR_SUCCESS;
319 }
320 
321 //------------------------------------------------------------------------------
322 // Restriction
323 //------------------------------------------------------------------------------
324 static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedInt dim,
325                                                       CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
326                                                       CeedInt Q_1d, bool is_input, bool is_tensor, bool is_at_points, bool use_3d_slices) {
327   std::string              var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
328   std::string              P_name     = (is_tensor ? "P_1d" : "P") + var_suffix;
329   CeedEvalMode             eval_mode  = CEED_EVAL_NONE;
330   CeedInt                  elem_size = 0, num_comp = 0, P_1d = 0;
331   CeedSize                 l_size;
332   CeedRestrictionType      rstr_type = CEED_RESTRICTION_STANDARD;
333   CeedElemRestriction_Hip *rstr_data;
334   CeedElemRestriction      elem_rstr;
335   CeedBasis                basis;
336 
337   // Get field data
338   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
339   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
340     CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
341     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
342     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
343     CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
344   }
345   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
346   if (basis != CEED_BASIS_NONE) {
347     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
348     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
349   }
350   CeedCallBackend(CeedBasisDestroy(&basis));
351   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
352 
353   // Restriction
354   if (is_input) {
355     // Input
356     if (field_input_buffer[i] != i) {
357       std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);
358 
359       // Restriction was already done for previous input
360       code << "    CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
361     } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) {
362       if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) {
363         // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated
364         code << "    CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
365       } else if (rstr_type != CEED_RESTRICTION_POINTS) {
366         // Otherwise we're using the scratch space
367         code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
368       }
369       switch (rstr_type) {
370         case CEED_RESTRICTION_STANDARD: {
371           CeedInt comp_stride;
372 
373           CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
374           code << "    const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
375           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
376           code << "    // CompStride: " << comp_stride << "\n";
377           data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
378           code << "    ReadLVecStandard" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name
379                << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix << ");\n";
380           break;
381         }
382         case CEED_RESTRICTION_STRIDED: {
383           bool    has_backend_strides;
384           CeedInt num_elem;
385 
386           CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
387           CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
388           CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
389 
390           if (!has_backend_strides) {
391             CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
392           }
393           code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
394           code << "    ReadLVecStrided" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", " << strides[0] << ", "
395                << strides[1] << ", " << strides[2] << ">(data, elem, d" << var_suffix << ", r_e" << var_suffix << ");\n";
396           break;
397         }
398         case CEED_RESTRICTION_POINTS: {
399           CeedInt comp_stride;
400 
401           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
402           code << "    const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
403           data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
404           break;
405         }
406         // LCOV_EXCL_START
407         case CEED_RESTRICTION_ORIENTED:
408         case CEED_RESTRICTION_CURL_ORIENTED:
409           break;  // TODO: Not implemented
410                   // LCOV_EXCL_STOP
411       }
412     }
413   } else {
414     // Output
415     switch (rstr_type) {
416       case CEED_RESTRICTION_STANDARD: {
417         CeedInt comp_stride;
418 
419         CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
420         code << "    const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
421         CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
422         code << "    // CompStride: " << comp_stride << "\n";
423         data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
424         code << "    WriteLVecStandard" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name
425              << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix << ");\n";
426         break;
427       }
428       case CEED_RESTRICTION_STRIDED: {
429         bool    has_backend_strides;
430         CeedInt num_elem;
431 
432         CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
433         CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
434         CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
435 
436         if (!has_backend_strides) {
437           CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
438         }
439         code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
440         code << "    WriteLVecStrided" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", " << strides[0] << ", "
441              << strides[1] << ", " << strides[2] << ">(data, elem, r_e" << var_suffix << ", d" << var_suffix << ");\n";
442         break;
443       }
444       case CEED_RESTRICTION_POINTS:
445         data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
446         break;
447       // LCOV_EXCL_START
448       case CEED_RESTRICTION_ORIENTED:
449       case CEED_RESTRICTION_CURL_ORIENTED:
450         break;  // TODO: Not implemented
451                 // LCOV_EXCL_STOP
452     }
453   }
454   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
455   return CEED_ERROR_SUCCESS;
456 }
457 
458 //------------------------------------------------------------------------------
459 // Basis
460 //------------------------------------------------------------------------------
461 static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedInt dim,
462                                                 CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool is_tensor,
463                                                 bool is_at_points, bool use_3d_slices) {
464   std::string         var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
465   std::string         P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
466   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
467   CeedInt             elem_size = 0, num_comp = 0, P_1d = 0;
468   CeedElemRestriction elem_rstr;
469   CeedBasis           basis;
470 
471   // Get field data
472   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
473   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
474     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
475     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
476   }
477   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
478   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
479   if (basis != CEED_BASIS_NONE) {
480     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
481     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
482   }
483   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
484 
485   // Basis
486   code << "    // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
487   if (is_input) {
488     switch (eval_mode) {
489       case CEED_EVAL_NONE:
490         if (!use_3d_slices && !is_at_points) {
491           code << "    CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n";
492         }
493         break;
494       case CEED_EVAL_INTERP:
495         if (is_at_points) {
496           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
497 
498           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
499           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
500                << var_suffix << ", r_c" << var_suffix << ");\n";
501         } else {
502           std::string function_name = is_tensor ? ((dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d") : "InterpNonTensor";
503 
504           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
505           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
506                << var_suffix << ", r_q" << var_suffix << ");\n";
507         }
508         break;
509       case CEED_EVAL_GRAD:
510         if (is_at_points) {
511           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
512 
513           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
514           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
515                << var_suffix << ", r_c" << var_suffix << ");\n";
516         } else if (use_3d_slices) {
517           std::string function_name = (dim > 1 ? "InterpTensor" : "Interp") + std::to_string(dim) + "d";
518 
519           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
520           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
521                << var_suffix << ", r_q" << var_suffix << ");\n";
522         } else if (is_tensor) {
523           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
524           std::string function_name = (dim == 1 ? "Grad" : (is_collocated ? "GradTensorCollocated" : "GradTensor")) + std::to_string(dim) + "d";
525 
526           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << (dim >= 3 ? Q_name : "1") << "];\n";
527           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
528                << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
529         } else {
530           std::string function_name = "GradNonTensor";
531 
532           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
533           code << "    " << function_name << "<num_comp" << var_suffix << ", dim, " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix
534                << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
535         }
536         break;
537       case CEED_EVAL_WEIGHT: {
538         if (is_at_points) {
539           code << "    // Nothing to do AtPoints\n";
540         } else {
541           CeedBasis_Hip_shared *basis_data;
542           std::string           function_name = is_tensor ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d") : "WeightNonTensor";
543 
544           code << "    CeedScalar r_q" << var_suffix << "[" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
545           CeedCallBackend(CeedBasisGetData(basis, &basis_data));
546           data->W = basis_data->d_q_weight_1d;
547           code << "    " << function_name << "<" << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
548         }
549         break;
550       }
551       // LCOV_EXCL_START
552       case CEED_EVAL_DIV:
553       case CEED_EVAL_CURL:
554         break;  // TODO: Not implemented
555                 // LCOV_EXCL_STOP
556     }
557   } else {
558     switch (eval_mode) {
559       case CEED_EVAL_NONE:
560         code << "    CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
561         break;  // No action
562       case CEED_EVAL_INTERP:
563         code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
564         if (is_at_points) {
565           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
566 
567           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_c" << var_suffix << ", s_B"
568                << var_suffix << ", r_e" << var_suffix << ");\n";
569         } else {
570           std::string function_name =
571               is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d") : "InterpTransposeNonTensor";
572 
573           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix << ", s_B"
574                << var_suffix << ", r_e" << var_suffix << ");\n";
575         }
576         break;
577       case CEED_EVAL_GRAD:
578         code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
579         if (is_at_points) {
580           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
581 
582           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_c" << var_suffix << ", s_B"
583                << var_suffix << ", r_e" << var_suffix << ");\n";
584         } else if (use_3d_slices) {
585           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
586 
587           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix << ", s_B"
588                << var_suffix << ", r_e" << var_suffix << ");\n";
589         } else if (is_tensor) {
590           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
591           std::string function_name =
592               (dim == 1 ? "GradTranspose" : (is_collocated ? "GradTransposeTensorCollocated" : "GradTransposeTensor")) + std::to_string(dim) + "d";
593 
594           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix << ", s_B"
595                << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
596         } else {
597           std::string function_name = "GradTransposeNonTensor";
598 
599           code << "    " << function_name << "<num_comp" << var_suffix << ", dim, " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix
600                << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
601         }
602         break;
603       // LCOV_EXCL_START
604       case CEED_EVAL_WEIGHT:
605         break;  // Should not occur
606       case CEED_EVAL_DIV:
607       case CEED_EVAL_CURL:
608         break;  // TODO: Not implemented
609                 // LCOV_EXCL_STOP
610     }
611   }
612   CeedCallBackend(CeedBasisDestroy(&basis));
613   return CEED_ERROR_SUCCESS;
614 }
615 
616 //------------------------------------------------------------------------------
617 // QFunction
618 //------------------------------------------------------------------------------
619 static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt dim, CeedInt max_num_points,
620                                                     CeedInt num_input_fields, CeedOperatorField *op_input_fields, CeedQFunctionField *qf_input_fields,
621                                                     CeedInt num_output_fields, CeedOperatorField *op_output_fields,
622                                                     CeedQFunctionField *qf_output_fields, std::string qfunction_name, CeedInt Q_1d, bool is_tensor,
623                                                     bool is_at_points, bool use_3d_slices) {
624   std::string         Q_name    = is_tensor ? "Q_1d" : "Q";
625   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
626   CeedElemRestriction elem_rstr;
627 
628   // Setup output arrays
629   code << "\n    // -- Output field setup\n";
630   for (CeedInt i = 0; i < num_output_fields; i++) {
631     const char *field_name;
632     std::string var_suffix = "_out_" + std::to_string(i);
633 
634     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
635     code << "    // ---- Output field " << i << ": " << field_name << "\n";
636     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
637     switch (eval_mode) {
638       case CEED_EVAL_NONE:
639         if (is_at_points) {
640           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n";
641         } else {
642           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
643         }
644         break;
645       case CEED_EVAL_INTERP:
646         if (is_at_points) {
647           // Accumulator for point data
648           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
649           code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "; i++) {\n";
650           code << "      r_c" << var_suffix << "[i] = 0.0;\n";
651           code << "    }\n";
652         } else {
653           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
654         }
655         break;
656       case CEED_EVAL_GRAD:
657         if (is_at_points) {
658           // Accumulator for point data
659           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "*dim];\n";
660           code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "; i++) {\n";
661           code << "      r_c" << var_suffix << "[i] = 0.0;\n";
662           code << "    }\n";
663         } else if (use_3d_slices) {
664           // Accumulator for gradient slices
665           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
666           code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) {\n";
667           code << "      r_q" << var_suffix << "[i] = 0.0;\n";
668           code << "    }\n";
669         } else {
670           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
671         }
672         break;
673       case CEED_EVAL_WEIGHT:
674         break;
675         // LCOV_EXCL_START
676       case CEED_EVAL_DIV:
677       case CEED_EVAL_CURL:
678         break;  // TODO: Not implemented
679                 // LCOV_EXCL_STOP
680     }
681   }
682 
683   if (is_at_points) {
684     // We need to handle batches of points
685     code << "\n    // Note: Using batches of points\n";
686     code << "    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * max_num_points / (blockDim.x * blockDim.y));\n\n";
687     code << "    #pragma unroll\n";
688     code << "    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {\n";
689     code << "      const CeedInt p = i % max_num_points;\n\n";
690 
691     code << "      // -- Coordinates\n";
692     code << "      CeedScalar r_x[dim];\n";
693     code << "      ReadPoint<dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n";
694 
695     code << "      // -- Input fields\n";
696     for (CeedInt i = 0; i < num_input_fields; i++) {
697       const char *field_name;
698       std::string var_suffix = "_in_" + std::to_string(i);
699 
700       CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
701       code << "      // ---- Input field " << i << ": " << field_name << "\n";
702       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
703       // Basis action
704       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
705       switch (eval_mode) {
706         case CEED_EVAL_NONE:
707           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
708           code << "      ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
709                << ", max_num_points>(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
710           break;
711         case CEED_EVAL_INTERP:
712           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
713           code << "      InterpAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_c" << var_suffix
714                << ", r_x, r_s" << var_suffix << ");\n";
715           break;
716         case CEED_EVAL_GRAD:
717           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
718           code << "      GradAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_c" << var_suffix
719                << ", r_x, r_s" << var_suffix << ");\n";
720           break;
721         case CEED_EVAL_WEIGHT:
722           code << "      CeedScalar r_s" << var_suffix << "[1];\n";
723           code << "      r_s" << var_suffix << "[0] = 1.0;\n";
724           break;
725           // LCOV_EXCL_START
726         case CEED_EVAL_DIV:
727         case CEED_EVAL_CURL:
728           break;  // TODO: Not implemented
729                   // LCOV_EXCL_STOP
730       }
731     }
732     code << "\n      // -- Output fields\n";
733     for (CeedInt i = 0; i < num_output_fields; i++) {
734       const char *field_name;
735       std::string var_suffix = "_out_" + std::to_string(i);
736 
737       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
738       code << "      // ---- Output field " << i << ": " << field_name << "\n";
739       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
740       // Basis action
741       switch (eval_mode) {
742         case CEED_EVAL_NONE:
743           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
744           break;
745         case CEED_EVAL_INTERP:
746           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
747           break;
748         case CEED_EVAL_GRAD:
749           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
750           break;
751           // LCOV_EXCL_START
752         case CEED_EVAL_WEIGHT:
753           break;  // Should not occur
754         case CEED_EVAL_DIV:
755         case CEED_EVAL_CURL:
756           break;  // TODO: Not implemented
757                   // LCOV_EXCL_STOP
758       }
759     }
760 
761   } else if (use_3d_slices) {
762     // We treat quadrature points per slice in 3d to save registers
763     code << "\n    // Note: Using planes of 3D elements\n";
764     code << "    #pragma unroll\n";
765     code << "    for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
766     code << "      // -- Input fields\n";
767     for (CeedInt i = 0; i < num_input_fields; i++) {
768       const char *field_name;
769       std::string var_suffix = "_in_" + std::to_string(i);
770 
771       CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
772       code << "      // ---- Input field " << i << ": " << field_name << "\n";
773       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
774       // Basis action
775       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
776       switch (eval_mode) {
777         case CEED_EVAL_NONE:
778           bool is_strided;
779 
780           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
781 
782           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
783           CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
784           if (is_strided) {
785             bool    has_backend_strides;
786             CeedInt num_elem, elem_size;
787 
788             CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
789             CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
790             CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
791             CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
792 
793             if (!has_backend_strides) {
794               CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
795             }
796             code << "      // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
797             code << "      ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", " << strides[0] << ", " << strides[1] << ", "
798                  << strides[2] << ">(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n";
799           } else {
800             CeedSize                 l_size = 0;
801             CeedInt                  comp_stride;
802             CeedElemRestriction_Hip *rstr_data;
803 
804             CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
805             code << "      const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
806             CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
807             code << "      // CompStride: " << comp_stride << "\n";
808             CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
809             data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
810             code << "      ReadEVecSliceStandard3d<num_comp" << var_suffix << ", " << comp_stride << ", " << Q_name << ">(data, l_size" << var_suffix
811                  << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
812           }
813           CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
814           break;
815         case CEED_EVAL_INTERP:
816           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
817           code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n";
818           code << "        r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n";
819           code << "      }\n";
820           break;
821         case CEED_EVAL_GRAD:
822           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
823           code << "      GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_q" << var_suffix << ", s_G" << var_suffix
824                << ", r_s" << var_suffix << ");\n";
825           break;
826         case CEED_EVAL_WEIGHT:
827           code << "      CeedScalar r_s" << var_suffix << "[1];\n";
828           code << "      r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n";
829           break;
830           // LCOV_EXCL_START
831         case CEED_EVAL_DIV:
832         case CEED_EVAL_CURL:
833           break;  // TODO: Not implemented
834                   // LCOV_EXCL_STOP
835       }
836     }
837     code << "\n      // -- Output fields\n";
838     for (CeedInt i = 0; i < num_output_fields; i++) {
839       const char *field_name;
840       std::string var_suffix = "_out_" + std::to_string(i);
841 
842       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
843       code << "      // ---- Output field " << i << ": " << field_name << "\n";
844       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
845       // Basis action
846       switch (eval_mode) {
847         case CEED_EVAL_NONE:
848           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
849           break;
850         case CEED_EVAL_INTERP:
851           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
852           break;
853         case CEED_EVAL_GRAD:
854           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
855           break;
856           // LCOV_EXCL_START
857         case CEED_EVAL_WEIGHT:
858           break;  // Should not occur
859         case CEED_EVAL_DIV:
860         case CEED_EVAL_CURL:
861           break;  // TODO: Not implemented
862                   // LCOV_EXCL_STOP
863       }
864     }
865   } else {
866     code << "\n    // Note: Using full elements\n";
867     code << "    {\n";
868     code << "      // -- Input fields\n";
869     for (CeedInt i = 0; i < num_input_fields; i++) {
870       const char *field_name;
871 
872       CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
873       code << "      // ---- Input field " << i << ": " << field_name << "\n";
874       code << "      CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n";
875     }
876     code << "      // -- Output fields\n";
877     for (CeedInt i = 0; i < num_output_fields; i++) {
878       const char *field_name;
879 
880       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
881       code << "      // ---- Output field " << i << ": " << field_name << "\n";
882       code << "      CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n";
883     }
884   }
885 
886   // Input and output buffers
887   code << "\n      // -- QFunction inputs and outputs\n";
888   code << "      // ---- Inputs\n";
889   code << "      CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
890   for (CeedInt i = 0; i < num_input_fields; i++) {
891     const char *field_name;
892 
893     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
894     code << "      // ------ Input field " << i << ": " << field_name << "\n";
895     code << "      inputs[" << i << "] = r_s_in_" << i << ";\n";
896   }
897   code << "      // ---- Outputs\n";
898   code << "      CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
899   for (CeedInt i = 0; i < num_output_fields; i++) {
900     const char *field_name;
901 
902     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
903     code << "      // ------ Output field " << i << ": " << field_name << "\n";
904     code << "      outputs[" << i << "] = r_s_out_" << i << ";\n";
905   }
906 
907   // Apply QFunction
908   code << "\n      // -- Apply QFunction\n";
909   code << "      " << qfunction_name << "(ctx, ";
910   if (dim != 3 || is_at_points || use_3d_slices || !is_tensor) {
911     code << "1";
912   } else {
913     code << Q_name;
914   }
915   code << ", inputs, outputs);\n";
916 
917   if (is_at_points) {
918     // Map back to coefficients
919     code << "\n      // -- Output fields\n";
920     for (CeedInt i = 0; i < num_output_fields; i++) {
921       const char *field_name;
922       std::string var_suffix = "_out_" + std::to_string(i);
923       std::string P_name     = "P_1d" + var_suffix;
924 
925       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
926       code << "      // ---- Output field " << i << ": " << field_name << "\n";
927       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
928       // Basis action
929       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
930       switch (eval_mode) {
931         case CEED_EVAL_NONE: {
932           CeedInt             comp_stride;
933           CeedElemRestriction elem_rstr;
934 
935           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
936           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
937           CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
938           code << "      const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
939           code << "      WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
940                << ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]"
941                << ", r_s" << var_suffix << ", d" << var_suffix << ");\n";
942           break;
943         }
944         case CEED_EVAL_INTERP:
945           code << "      if (i >= points.num_per_elem[elem]) {\n";
946           code << "        for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
947           code << "      }\n";
948           code << "      InterpTransposeAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_s"
949                << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
950           break;
951         case CEED_EVAL_GRAD:
952           code << "      if (i >= points.num_per_elem[elem]) {\n";
953           code << "        for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim; j++) r_s" << var_suffix << "[j] = 0.0;\n";
954           code << "      }\n";
955           code << "      GradTransposeAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_s"
956                << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
957           break;
958           // LCOV_EXCL_START
959         case CEED_EVAL_WEIGHT:
960           break;  // Should not occur
961         case CEED_EVAL_DIV:
962         case CEED_EVAL_CURL:
963           break;  // TODO: Not implemented
964                   // LCOV_EXCL_STOP
965       }
966     }
967   } else if (use_3d_slices) {
968     // Copy or apply transpose grad, if needed
969     code << "\n      // -- Output fields\n";
970     for (CeedInt i = 0; i < num_output_fields; i++) {
971       const char *field_name;
972       std::string var_suffix = "_out_" + std::to_string(i);
973       std::string P_name     = "P_1d" + var_suffix;
974 
975       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
976       code << "      // ---- Output field " << i << ": " << field_name << "\n";
977       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
978       // Basis action
979       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
980       switch (eval_mode) {
981         case CEED_EVAL_NONE:
982           code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
983           code << "        r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
984           code << "      }\n";
985           break;
986         case CEED_EVAL_INTERP:
987           code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
988           code << "        r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
989           code << "      }\n";
990           break;
991         case CEED_EVAL_GRAD:
992           code << "      GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_s" << var_suffix << ", s_G"
993                << var_suffix << ", r_q" << var_suffix << ");\n";
994           break;
995           // LCOV_EXCL_START
996         case CEED_EVAL_WEIGHT:
997           break;  // Should not occur
998         case CEED_EVAL_DIV:
999         case CEED_EVAL_CURL:
1000           break;  // TODO: Not implemented
1001                   // LCOV_EXCL_STOP
1002       }
1003     }
1004   }
1005   code << "    }\n";
1006   return CEED_ERROR_SUCCESS;
1007 }
1008 
1009 //------------------------------------------------------------------------------
1010 // Build single operator kernel
1011 //------------------------------------------------------------------------------
1012 extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_build) {
1013   bool                   is_tensor = true, is_at_points = false, use_3d_slices = false;
1014   Ceed                   ceed;
1015   CeedInt                Q_1d, num_input_fields, num_output_fields, dim = 1, max_num_points = 0, coords_comp_stride = 0;
1016   CeedQFunctionField    *qf_input_fields, *qf_output_fields;
1017   CeedQFunction_Hip_gen *qf_data;
1018   CeedQFunction          qf;
1019   CeedOperatorField     *op_input_fields, *op_output_fields;
1020   CeedOperator_Hip_gen  *data;
1021   std::ostringstream     code;
1022 
1023   CeedCallBackend(CeedOperatorGetData(op, &data));
1024   {
1025     bool is_setup_done;
1026 
1027     CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
1028     if (is_setup_done) {
1029       *is_good_build = !data->use_fallback;
1030       return CEED_ERROR_SUCCESS;
1031     }
1032   }
1033 
1034   // Check field compatibility
1035   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
1036   {
1037     bool has_shared_bases = true, is_all_tensor = true, is_all_nontensor = true;
1038 
1039     for (CeedInt i = 0; i < num_input_fields; i++) {
1040       CeedBasis basis;
1041 
1042       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
1043       if (basis != CEED_BASIS_NONE) {
1044         bool        is_tensor = true;
1045         const char *resource;
1046         char       *resource_root;
1047         Ceed        basis_ceed;
1048 
1049         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
1050         is_all_tensor    = is_all_tensor && is_tensor;
1051         is_all_nontensor = is_all_nontensor && !is_tensor;
1052         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
1053         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
1054         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
1055         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
1056         CeedCallBackend(CeedFree(&resource_root));
1057         CeedCallBackend(CeedDestroy(&basis_ceed));
1058       }
1059       CeedCallBackend(CeedBasisDestroy(&basis));
1060     }
1061 
1062     for (CeedInt i = 0; i < num_output_fields; i++) {
1063       CeedBasis basis;
1064 
1065       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
1066       if (basis != CEED_BASIS_NONE) {
1067         bool        is_tensor = true;
1068         const char *resource;
1069         char       *resource_root;
1070         Ceed        basis_ceed;
1071 
1072         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
1073         is_all_tensor    = is_all_tensor && is_tensor;
1074         is_all_nontensor = is_all_nontensor && !is_tensor;
1075 
1076         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
1077         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
1078         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
1079         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
1080         CeedCallBackend(CeedFree(&resource_root));
1081         CeedCallBackend(CeedDestroy(&basis_ceed));
1082       }
1083       CeedCallBackend(CeedBasisDestroy(&basis));
1084     }
1085     // -- Fallback to ref if not all bases are shared
1086     if (!has_shared_bases || (!is_all_tensor && !is_all_nontensor)) {
1087       *is_good_build = false;
1088       return CEED_ERROR_SUCCESS;
1089     }
1090   }
1091   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1092   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
1093   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
1094   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
1095 
1096   // Get operator data
1097   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
1098   CeedCallBackend(CeedOperatorBuildKernelData_Hip_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields,
1099                                                       qf_output_fields, &data->max_P_1d, &Q_1d, &dim, &is_tensor, &use_3d_slices));
1100   if (dim == 0) dim = 1;
1101   data->dim = dim;
1102   if (is_at_points) {
1103     CeedElemRestriction_Hip *rstr_data;
1104     CeedElemRestriction      rstr_points = NULL;
1105 
1106     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
1107     CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
1108     CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
1109     CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data));
1110     data->points.indices = (CeedInt *)rstr_data->d_offsets;
1111     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
1112   }
1113   if (is_at_points) use_3d_slices = false;
1114   if (Q_1d == 0) {
1115     if (is_at_points) Q_1d = max_num_points;
1116     else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d));
1117   }
1118   data->Q_1d = Q_1d;
1119 
1120   // Check for restriction only identity operator
1121   {
1122     bool is_identity_qf;
1123 
1124     CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf));
1125     if (is_identity_qf) {
1126       CeedEvalMode eval_mode_in, eval_mode_out;
1127 
1128       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in));
1129       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out));
1130       CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND,
1131                 "Backend does not implement restriction only identity operators");
1132     }
1133   }
1134 
1135   // Load basis source files
1136   if (is_tensor) {
1137     code << "// Tensor basis source\n";
1138     code << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
1139   } else {
1140     code << "// Non-tensor basis source\n";
1141     code << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n";
1142   }
1143   if (is_at_points) {
1144     code << "// AtPoints basis source\n";
1145     code << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n";
1146   }
1147   code << "// CodeGen operator source\n";
1148   code << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
1149 
1150   // Get QFunction name
1151   std::string qfunction_name(qf_data->qfunction_name);
1152   std::string operator_name;
1153 
1154   operator_name = "CeedKernelHipGenOperator_" + qfunction_name;
1155 
1156   // Define CEED_Q_VLA
1157   code << "\n#undef CEED_Q_VLA\n";
1158   if (dim != 3 || is_at_points || use_3d_slices || !is_tensor) {
1159     code << "#define CEED_Q_VLA 1\n\n";
1160   } else {
1161     code << "#define CEED_Q_VLA " << Q_1d << "\n\n";
1162   }
1163 
1164   // Add user QFunction source
1165   {
1166     const char *source_path;
1167 
1168     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
1169     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
1170 
1171     code << "// User QFunction source\n";
1172     code << "#include \"" << source_path << "\"\n\n";
1173   }
1174 
1175   // Setup
1176   code << "\n// -----------------------------------------------------------------------------\n";
1177   code << "// Operator Kernel\n";
1178   code << "// \n";
1179   code << "// d_[in,out]_i:   CeedVector device array\n";
1180   code << "// r_[in,out]_e_i: Element vector register\n";
1181   code << "// r_[in,out]_q_i: Quadrature space vector register\n";
1182   code << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
1183   code << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
1184   code << "// \n";
1185   code << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
1186   code << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
1187   code << "// -----------------------------------------------------------------------------\n";
1188   code << "\nextern \"C\" __launch_bounds__(BLOCK_SIZE)\n";
1189   code << "__global__ void " << operator_name
1190        << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar* W, Points_Hip points) {\n";
1191 
1192   // Scratch buffers
1193   for (CeedInt i = 0; i < num_input_fields; i++) {
1194     CeedEvalMode eval_mode;
1195 
1196     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1197     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
1198       code << "  const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
1199     }
1200   }
1201   for (CeedInt i = 0; i < num_output_fields; i++) {
1202     code << "  CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
1203   }
1204 
1205   code << "  const CeedInt dim = " << dim << ";\n";
1206   code << "  const CeedInt " << (is_tensor ? "Q_1d" : "Q") << " = " << Q_1d << ";\n";
1207   if (is_at_points) {
1208     code << "  const CeedInt max_num_points = " << max_num_points << ";\n";
1209     code << "  const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
1210   }
1211 
1212   // Shared data
1213   code << "  extern __shared__ CeedScalar slice[];\n";
1214   code << "  SharedData_Hip data;\n";
1215   code << "  data.t_id_x = threadIdx.x;\n";
1216   code << "  data.t_id_y = threadIdx.y;\n";
1217   code << "  data.t_id_z = threadIdx.z;\n";
1218   code << "  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
1219   code << "  data.slice = slice + data.t_id_z*T_1D" << ((!is_tensor || dim == 1) ? "" : "*T_1D") << ";\n";
1220 
1221   // -- Determine input mat reuse
1222   FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
1223 
1224   for (CeedInt i = 0; i < num_input_fields; i++) {
1225     input_matrix_reuse[i].index = -1;
1226   }
1227   for (CeedInt i = 0; i < num_input_fields; i++) {
1228     CeedEvalMode eval_mode_i;
1229     CeedBasis    basis_i;
1230 
1231     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
1232     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
1233     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
1234     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
1235       CeedEvalMode eval_mode_j;
1236       CeedBasis    basis_j;
1237 
1238       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1239       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1240       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1241       if (basis_i == basis_j) {
1242         if (is_tensor) {
1243           input_matrix_reuse[i].index     = j;
1244           input_matrix_reuse[i].is_input  = true;
1245           input_matrix_reuse[i].eval_mode = eval_mode_j;
1246         } else {
1247           // For non-tensor can only re-use with the same eval mode
1248           if (eval_mode_i == eval_mode_j) {
1249             input_matrix_reuse[i].index     = j;
1250             input_matrix_reuse[i].is_input  = true;
1251             input_matrix_reuse[i].eval_mode = eval_mode_j;
1252           }
1253         }
1254       }
1255       CeedCallBackend(CeedBasisDestroy(&basis_j));
1256     }
1257     CeedCallBackend(CeedBasisDestroy(&basis_i));
1258   }
1259 
1260   // -- Determine output mat reuse
1261   FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
1262 
1263   for (CeedInt i = 0; i < num_output_fields; i++) {
1264     output_matrix_reuse[i].index = -1;
1265   }
1266   for (CeedInt i = 0; i < num_output_fields; i++) {
1267     CeedEvalMode eval_mode_i;
1268     CeedBasis    basis_i;
1269 
1270     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
1271     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
1272     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
1273       CeedEvalMode eval_mode_j;
1274       CeedBasis    basis_j;
1275 
1276       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1277       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1278       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1279       if (basis_i == basis_j) {
1280         if (is_tensor) {
1281           output_matrix_reuse[i].index     = j;
1282           output_matrix_reuse[i].is_input  = true;
1283           output_matrix_reuse[i].eval_mode = eval_mode_j;
1284         } else {
1285           // For non-tensor can only re-use with the same eval mode
1286           if (eval_mode_i == eval_mode_j) {
1287             output_matrix_reuse[i].index     = j;
1288             output_matrix_reuse[i].is_input  = true;
1289             output_matrix_reuse[i].eval_mode = eval_mode_j;
1290           }
1291         }
1292       }
1293       CeedCallBackend(CeedBasisDestroy(&basis_j));
1294     }
1295     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
1296       CeedEvalMode eval_mode_j;
1297       CeedBasis    basis_j;
1298 
1299       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
1300       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1301       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
1302       if (basis_i == basis_j) {
1303         if (is_tensor) {
1304           output_matrix_reuse[i].index     = j;
1305           output_matrix_reuse[i].is_input  = false;
1306           output_matrix_reuse[i].eval_mode = eval_mode_j;
1307         } else {
1308           // For non-tensor can only re-use with the same eval mode
1309           if (eval_mode_i == eval_mode_j) {
1310             output_matrix_reuse[i].index     = j;
1311             output_matrix_reuse[i].is_input  = false;
1312             output_matrix_reuse[i].eval_mode = eval_mode_j;
1313           }
1314         }
1315       }
1316       CeedCallBackend(CeedBasisDestroy(&basis_j));
1317     }
1318     CeedCallBackend(CeedBasisDestroy(&basis_i));
1319   }
1320 
1321   // Initialize constants, and matrices B and G
1322   code << "\n  // Input field constants and basis data\n";
1323   for (CeedInt i = 0; i < num_input_fields; i++) {
1324     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], Q_1d, true,
1325                                                              is_tensor, is_at_points, use_3d_slices));
1326   }
1327   code << "\n  // Output field constants and basis data\n";
1328   for (CeedInt i = 0; i < num_output_fields; i++) {
1329     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], Q_1d,
1330                                                              false, is_tensor, is_at_points, use_3d_slices));
1331   }
1332 
1333   // Loop over all elements
1334   code << "\n  // Element loop\n";
1335   code << "  __syncthreads();\n";
1336   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
1337 
1338   // -- Compute minimum buffer space needed
1339   CeedInt max_rstr_buffer_size = 1;
1340 
1341   for (CeedInt i = 0; i < num_input_fields; i++) {
1342     CeedInt             num_comp, elem_size;
1343     CeedElemRestriction elem_rstr;
1344 
1345     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
1346     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1347     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
1348     max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_tensor && (dim >= 3) ? elem_size : 1));
1349     CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1350   }
1351   for (CeedInt i = 0; i < num_output_fields; i++) {
1352     CeedInt             num_comp, elem_size;
1353     CeedElemRestriction elem_rstr;
1354 
1355     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
1356     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1357     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
1358     max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_tensor && (dim >= 3) ? elem_size : 1));
1359     CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1360   }
1361   code << "    // Scratch restriction buffer space\n";
1362   code << "    CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
1363 
1364   // -- Determine best input field processing order
1365   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
1366 
1367   for (CeedInt i = 0; i < num_input_fields; i++) {
1368     field_rstr_in_buffer[i] = -1;
1369     input_field_order[i]    = -1;
1370   }
1371   {
1372     bool    is_ordered[CEED_FIELD_MAX];
1373     CeedInt curr_index = 0;
1374 
1375     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
1376     for (CeedInt i = 0; i < num_input_fields; i++) {
1377       CeedVector          vec_i;
1378       CeedElemRestriction rstr_i;
1379 
1380       if (is_ordered[i]) continue;
1381       field_rstr_in_buffer[i]       = i;
1382       is_ordered[i]                 = true;
1383       input_field_order[curr_index] = i;
1384       curr_index++;
1385       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
1386       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
1387       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
1388       for (CeedInt j = i + 1; j < num_input_fields; j++) {
1389         CeedVector          vec_j;
1390         CeedElemRestriction rstr_j;
1391 
1392         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
1393         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
1394         if (rstr_i == rstr_j && vec_i == vec_j) {
1395           field_rstr_in_buffer[j]       = i;
1396           is_ordered[j]                 = true;
1397           input_field_order[curr_index] = j;
1398           curr_index++;
1399         }
1400         CeedCallBackend(CeedVectorDestroy(&vec_j));
1401         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
1402       }
1403       CeedCallBackend(CeedVectorDestroy(&vec_i));
1404       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
1405     }
1406   }
1407 
1408   // -- Input restriction and basis
1409   code << "\n    // -- Input field restrictions and basis actions\n";
1410   for (CeedInt i = 0; i < num_input_fields; i++) {
1411     const char   *field_name;
1412     const CeedInt f = input_field_order[i];
1413 
1414     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
1415     code << "    // ---- Input field " << f << ": " << field_name << "\n";
1416 
1417     // ---- Restriction
1418     CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, f, dim, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], Q_1d,
1419                                                                true, is_tensor, is_at_points, use_3d_slices));
1420 
1421     // ---- Basis action
1422     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, is_tensor,
1423                                                          is_at_points, use_3d_slices));
1424   }
1425 
1426   // -- Q function
1427   CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, dim, max_num_points, num_input_fields, op_input_fields, qf_input_fields,
1428                                                            num_output_fields, op_output_fields, qf_output_fields, qfunction_name, Q_1d, is_tensor,
1429                                                            is_at_points, use_3d_slices));
1430 
1431   // -- Output basis and restriction
1432   code << "\n    // -- Output field basis action and restrictions\n";
1433   for (CeedInt i = 0; i < num_output_fields; i++) {
1434     const char *field_name;
1435 
1436     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
1437     code << "    // ---- Output field " << i << ": " << field_name << "\n";
1438 
1439     // ---- Basis action
1440     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_tensor,
1441                                                          is_at_points, use_3d_slices));
1442 
1443     // ---- Restriction
1444     CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, i, dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false,
1445                                                                is_tensor, is_at_points, use_3d_slices));
1446   }
1447 
1448   // Close loop and function
1449   code << "  }\n";
1450   code << "}\n";
1451   code << "// -----------------------------------------------------------------------------\n\n";
1452 
1453   CeedInt block_sizes[3] = {0, 0, 0};
1454   CeedInt num_elem;
1455 
1456   // Compile
1457   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
1458   CeedCallBackend(BlockGridCalculate_Hip_gen(is_tensor ? dim : 1, num_elem, data->max_P_1d, Q_1d, block_sizes));
1459   {
1460     bool is_compile_good = false;
1461 
1462     CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "T_1D", block_sizes[0], "BLOCK_SIZE",
1463                                        block_sizes[0] * block_sizes[1] * block_sizes[2]));
1464     if (is_compile_good) {
1465       *is_good_build = true;
1466       CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, operator_name.c_str(), &data->op));
1467     } else {
1468       *is_good_build     = false;
1469       data->use_fallback = true;
1470     }
1471   }
1472   CeedCallBackend(CeedOperatorSetSetupDone(op));
1473   CeedCallBackend(CeedDestroy(&ceed));
1474   CeedCallBackend(CeedQFunctionDestroy(&qf));
1475   return CEED_ERROR_SUCCESS;
1476 }
1477 
1478 //------------------------------------------------------------------------------
1479