xref: /libCEED/rust/libceed-sys/c-src/backends/hip-gen/ceed-hip-gen-operator-build.cpp (revision 0ccda8ebe42db3fb90cdb724a58e4e5d2aedf1a1)
1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #define CEED_DEBUG_COLOR 12
9 
10 #include <ceed.h>
11 #include <ceed/backend.h>
12 #include <ceed/gen-tools.h>
13 #include <ceed/jit-tools.h>
14 
15 #include <iostream>
16 #include <sstream>
17 #include <string>
18 
19 #include "../hip-ref/ceed-hip-ref.h"
20 #include "../hip-shared/ceed-hip-shared.h"
21 #include "../hip/ceed-hip-common.h"
22 #include "../hip/ceed-hip-compile.h"
23 #include "ceed-hip-gen.h"
24 
// Describes reuse of another field's shared-memory basis matrices.
// When a field's basis data matches an earlier field's, the generated kernel
// aliases that field's s_B/s_G arrays instead of loading a fresh copy.
struct FieldReuse_Hip {
  CeedInt      index;      // Index of the earlier field to reuse, or -1 for no reuse
  bool         is_input;   // True if the reused field is an input field
  CeedEvalMode eval_mode;  // Eval mode of the reused field
};
30 
31 //------------------------------------------------------------------------------
32 // Calculate the block size used for launching the operator kernel
33 //------------------------------------------------------------------------------
34 extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt num_elem, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes) {
35   const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
36   if (dim == 1) {
37     CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
38 
39     elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
40     block_sizes[0]  = thread_1d;
41     block_sizes[1]  = 1;
42     block_sizes[2]  = elems_per_block;
43   } else if (dim == 2) {
44     const CeedInt elems_per_block = thread_1d < 4 ? 16 : 2;
45 
46     block_sizes[0] = thread_1d;
47     block_sizes[1] = thread_1d;
48     block_sizes[2] = elems_per_block;
49   } else if (dim == 3) {
50     const CeedInt elems_per_block = thread_1d < 6 ? 4 : (thread_1d < 8 ? 2 : 1);
51 
52     block_sizes[0] = thread_1d;
53     block_sizes[1] = thread_1d;
54     block_sizes[2] = elems_per_block;
55   }
56   return CEED_ERROR_SUCCESS;
57 }
58 
59 //------------------------------------------------------------------------------
60 // Determine type of operator
61 //------------------------------------------------------------------------------
62 static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
63                                                CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields,
64                                                CeedQFunctionField *qf_output_fields, CeedInt *max_P, CeedInt *max_P_1d, CeedInt *Q, CeedInt *Q_1d,
65                                                CeedInt *max_dim, bool *is_all_tensor, bool *use_3d_slices) {
66   // Check if all are tensor
67   *is_all_tensor = true;
68   for (CeedInt i = 0; i < num_input_fields; i++) {
69     CeedBasis basis;
70 
71     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
72     if (basis != CEED_BASIS_NONE) {
73       bool is_field_tensor;
74 
75       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
76       *is_all_tensor = *is_all_tensor && is_field_tensor;
77     }
78     CeedCallBackend(CeedBasisDestroy(&basis));
79   }
80   for (CeedInt i = 0; i < num_output_fields; i++) {
81     CeedBasis basis;
82 
83     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
84     if (basis != CEED_BASIS_NONE) {
85       bool is_field_tensor;
86 
87       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
88       *is_all_tensor = *is_all_tensor && is_field_tensor;
89     }
90     CeedCallBackend(CeedBasisDestroy(&basis));
91   }
92 
93   // Find max_P, max_P_1d, Q, and Q_1d
94   bool is_all_3d = true;
95 
96   *max_P    = 0;
97   *max_P_1d = 0;
98   *Q        = 0;
99   *Q_1d     = 0;
100   for (CeedInt i = 0; i < num_input_fields; i++) {
101     CeedBasis basis;
102 
103     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
104     if (basis != CEED_BASIS_NONE) {
105       bool    is_field_tensor;
106       CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
107 
108       // Check if 3D
109       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
110       is_all_3d = is_all_3d && (field_dim == 3);
111       *max_dim  = CeedIntMax(*max_dim, field_dim);
112 
113       // Collect P, P_1d, Q, and Q_1d
114       CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
115       *max_P = CeedIntMax(*max_P, field_P);
116       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
117       if (is_field_tensor) {
118         CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
119         *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
120       }
121       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
122       CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
123       *Q = field_Q;
124       if (is_field_tensor) {
125         CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
126         CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
127         *Q_1d = field_Q_1d;
128       }
129     }
130     CeedCallBackend(CeedBasisDestroy(&basis));
131   }
132   for (CeedInt i = 0; i < num_output_fields; i++) {
133     CeedBasis basis;
134 
135     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
136     if (basis != CEED_BASIS_NONE) {
137       bool    is_field_tensor;
138       CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
139 
140       // Check if 3D
141       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
142       is_all_3d = is_all_3d && (field_dim == 3);
143       *max_dim  = CeedIntMax(*max_dim, field_dim);
144 
145       // Collect P, P_1d, Q, and Q_1d
146       CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
147       *max_P = CeedIntMax(*max_P, field_P);
148       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
149       if (is_field_tensor) {
150         CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
151         *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
152       }
153       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
154       CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
155       *Q = field_Q;
156       if (is_field_tensor) {
157         CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
158         CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
159         *Q_1d = field_Q_1d;
160       }
161     }
162     CeedCallBackend(CeedBasisDestroy(&basis));
163   }
164 
165   // Only use 3D collocated gradient parallelization strategy when gradient is computed
166   *use_3d_slices = false;
167   if (is_all_3d && *is_all_tensor) {
168     bool was_grad_found = false;
169 
170     for (CeedInt i = 0; i < num_input_fields; i++) {
171       CeedEvalMode eval_mode;
172 
173       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
174       if (eval_mode == CEED_EVAL_GRAD) {
175         CeedBasis_Hip_shared *basis_data;
176         CeedBasis             basis;
177 
178         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
179         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
180         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
181         was_grad_found = true;
182         CeedCallBackend(CeedBasisDestroy(&basis));
183       }
184     }
185     for (CeedInt i = 0; i < num_output_fields; i++) {
186       CeedEvalMode eval_mode;
187 
188       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
189       if (eval_mode == CEED_EVAL_GRAD) {
190         CeedBasis_Hip_shared *basis_data;
191         CeedBasis             basis;
192 
193         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
194         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
195         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
196         was_grad_found = true;
197         CeedCallBackend(CeedBasisDestroy(&basis));
198       }
199     }
200   }
201   return CEED_ERROR_SUCCESS;
202 }
203 
204 //------------------------------------------------------------------------------
205 // Setup fields
206 //------------------------------------------------------------------------------
// Emit the per-field constants and shared-memory basis matrices (s_B/s_G) for
// one operator field into the generated kernel source in `code`, and record
// the matching device matrix pointers in data->B / data->G (presumably
// consumed as kernel launch arguments -- confirm against the launch path).
static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i,
                                                    CeedOperatorField op_field, CeedQFunctionField qf_field, FieldReuse_Hip field_reuse,
                                                    CeedInt max_dim, CeedInt Q, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
                                                    bool use_3d_slices) {
  // Default to tensor when no basis is attached (CEED_BASIS_NONE)
  bool      is_tensor = true;
  CeedBasis basis;
  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
  if (basis != CEED_BASIS_NONE) CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));

  // Names used in the generated source, suffixed per field, e.g. "P_1d_in_0"
  const char           *field_name;
  std::string           var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
  std::string           P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
  std::string           option_name = (is_input ? "inputs" : "outputs");
  CeedEvalMode          eval_mode   = CEED_EVAL_NONE;
  CeedInt               elem_size = 0, num_comp = 0, dim = max_dim, P_1d = 0;
  CeedElemRestriction   elem_rstr;
  CeedBasis_Hip_shared *basis_data;

  // Field reuse info
  bool use_previous_field = field_reuse.index != -1;

  CeedCallBackend(CeedOperatorFieldGetName(op_field, &field_name));
  code << tab << "// -- " << (is_input ? "Input" : "Output") << " field " << i << ": " << field_name << "\n";

  // Get field data
  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
  }
  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
  if (basis != CEED_BASIS_NONE) {
    // NOTE(review): basis_data stays uninitialized when basis == CEED_BASIS_NONE;
    // the INTERP/GRAD cases below dereference it, so those modes presumably
    // always come with a real basis -- confirm
    CeedCallBackend(CeedBasisGetData(basis, &basis_data));
    CeedCallBackend(CeedBasisGetDimension(basis, &dim));
    // P_1d holds the 1D node count for tensor bases, total node count otherwise
    if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
    else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
  }
  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));

  // Set field constants
  code << tab << "const CeedInt dim" << var_suffix << " = " << dim << ";\n";
  if (is_tensor && !is_all_tensor) {
    // Mixed tensor/non-tensor operator: also emit the total node count P
    CeedInt P = 0;

    CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
    code << tab << "const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q : P) << ";\n";
  }
  // Without a basis the field lives at quadrature points, so P collapses to Q
  code << tab << "const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
  if (eval_mode != CEED_EVAL_WEIGHT) {
    code << tab << "const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
  }

  // Load basis data
  code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
  switch (eval_mode) {
    case CEED_EVAL_NONE:
      break;
    case CEED_EVAL_INTERP:
      if (is_at_points) {
        // AtPoints: lazily build and upload the Chebyshev interpolation matrix
        // to the device on first use, cached on the shared basis data
        if (!basis_data->d_chebyshev_interp_1d) {
          CeedSize    interp_bytes;
          CeedScalar *chebyshev_interp_1d;

          interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
          CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
          CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
          CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
          CeedCallHip(CeedBasisReturnCeed(basis),
                      hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
          CeedCallBackend(CeedFree(&chebyshev_interp_1d));
        }
        if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
        else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
      } else {
        // Standard quadrature
        if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
        else data->B.outputs[i] = basis_data->d_interp_1d;
      }
      if (use_previous_field) {
        // Alias the s_B matrix already loaded for the matching earlier field
        std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

        code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
      } else {
        bool is_collocated = false;

        CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
        if (is_collocated && !is_at_points) {
          // Collocated interp needs no matrix; signal that with a NULL pointer
          code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
        } else {
          code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
          code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
        }
      }
      break;
    case CEED_EVAL_GRAD:
      // Gradient always needs the interp matrix (s_B) too; this mirrors the
      // CEED_EVAL_INTERP setup above
      if (is_at_points) {
        // AtPoints: lazily build and upload the Chebyshev interpolation matrix
        if (!basis_data->d_chebyshev_interp_1d) {
          CeedSize    interp_bytes;
          CeedScalar *chebyshev_interp_1d;

          interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
          CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
          CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
          CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
          CeedCallHip(CeedBasisReturnCeed(basis),
                      hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
          CeedCallBackend(CeedFree(&chebyshev_interp_1d));
        }
        if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
        else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
      } else {
        // Standard quadrature
        if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
        else data->B.outputs[i] = basis_data->d_interp_1d;
      }
      if (is_tensor) {
        if (use_previous_field) {
          std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

          code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
        } else {
          bool is_collocated = false;

          CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
          if (is_collocated && !is_at_points) {
            code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
          } else {
            code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
            code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
          }
        }
      }
      if (is_at_points) break;  // No G mat for AtPoints
      if (use_3d_slices) {
        // 3D slices strategy uses the collocated gradient matrix (Q_1d x Q_1d)
        if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
        else data->G.outputs[i] = basis_data->d_collo_grad_1d;
        // s_G can only be reused from a field that also did CEED_EVAL_GRAD
        if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
          std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

          code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
        } else {
          code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
          code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
        }
      } else {
        // Prefer the collocated gradient matrix when the basis provides one
        bool has_collo_grad = basis_data->d_collo_grad_1d;

        if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
        else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
        if (has_collo_grad) {
          // Collocated gradient: Q_1d x Q_1d matrix
          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
            std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

            code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
          } else {
            code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
            code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
          }
        } else {
          // Full gradient matrix: P x Q, times dim for non-tensor bases
          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
            std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

            code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
          } else {
            code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim")
                 << (is_tensor ? "" : var_suffix) << "];\n";
            code << tab << "LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << (is_tensor ? "" : var_suffix) << ">(data, G."
                 << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
          }
        }
      }
      break;
    case CEED_EVAL_WEIGHT:
      break;  // No action
      // LCOV_EXCL_START
    case CEED_EVAL_DIV:
    case CEED_EVAL_CURL:
      break;  // TODO: Not implemented
              // LCOV_EXCL_STOP
  }
  CeedCallBackend(CeedBasisDestroy(&basis));
  return CEED_ERROR_SUCCESS;
}
392 
393 //------------------------------------------------------------------------------
394 // Restriction
395 //------------------------------------------------------------------------------
// Emit the element-restriction read (inputs) or write (outputs) code for one
// field into the generated kernel source, and record the device offset arrays
// in data->indices. For inputs this declares/aliases the element buffer
// r_e_<suffix>; field_input_buffer[i] != i means input i shares the restriction
// result of an earlier input.
static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i,
                                                      CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
                                                      CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
                                                      bool use_3d_slices) {
  std::string              var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
  std::string              P_name     = (is_all_tensor ? "P_1d" : "P") + var_suffix;
  CeedEvalMode             eval_mode  = CEED_EVAL_NONE;
  CeedInt                  elem_size = 0, num_comp = 0;
  CeedSize                 l_size;
  CeedRestrictionType      rstr_type = CEED_RESTRICTION_STANDARD;
  // NOTE(review): rstr_data stays uninitialized when elem_rstr is
  // CEED_ELEMRESTRICTION_NONE; the STANDARD/POINTS cases below dereference it,
  // so those presumably always come with a real restriction -- confirm
  CeedElemRestriction_Hip *rstr_data;
  CeedElemRestriction      elem_rstr;

  // Get field data
  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
    CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
    CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
  }
  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));

  // Restriction
  if (is_input) {
    // Input
    if (field_input_buffer[i] != i) {
      std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);

      // Restriction was already done for previous input
      code << tab << "CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
    } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) {
      // WEIGHT needs no L-vector; NONE with 3D slices AtPoints is skipped too
      if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) {
        // No basis action, so r_e_in_* is also r_q_in_* and needs to be allocated
        code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
      } else if (rstr_type != CEED_RESTRICTION_POINTS) {
        // Otherwise we're using the scratch space
        code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
      }
      switch (rstr_type) {
        case CEED_RESTRICTION_STANDARD: {
          // Indexed gather from the L-vector via device offsets
          CeedInt comp_stride;

          CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
          code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
          data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
          code << tab << "ReadLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
               << P_name << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix
               << ");\n";
          break;
        }
        case CEED_RESTRICTION_STRIDED: {
          // Strided access; fall back to default strides when the backend owns them
          bool    has_backend_strides;
          CeedInt num_elem;

          CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
          CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
          CeedInt strides[3] = {1, elem_size * num_elem, elem_size};

          if (!has_backend_strides) {
            CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
          }
          code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
               << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
          code << tab << "ReadLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
               << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, d" << var_suffix << ", r_e"
               << var_suffix << ");\n";
          break;
        }
        case CEED_RESTRICTION_POINTS: {
          // Points restriction: only record constants/offsets; no read emitted here
          CeedInt comp_stride;

          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
          data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
          break;
        }
        // LCOV_EXCL_START
        case CEED_RESTRICTION_ORIENTED:
        case CEED_RESTRICTION_CURL_ORIENTED:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
  } else {
    // Output: scatter r_e back into the L-vector (mirrors the input cases)
    switch (rstr_type) {
      case CEED_RESTRICTION_STANDARD: {
        CeedInt comp_stride;

        CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
        code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
        CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
        code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
        data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
        code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
             << P_name << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix
             << ");\n";
        break;
      }
      case CEED_RESTRICTION_STRIDED: {
        bool    has_backend_strides;
        CeedInt num_elem;

        CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
        CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
        CeedInt strides[3] = {1, elem_size * num_elem, elem_size};

        if (!has_backend_strides) {
          CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
        }
        code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
             << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
        code << tab << "WriteLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
             << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, r_e" << var_suffix << ", d" << var_suffix
             << ");\n";
        break;
      }
      case CEED_RESTRICTION_POINTS:
        // Only record offsets; no write emitted here
        data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
        break;
      // LCOV_EXCL_START
      case CEED_RESTRICTION_ORIENTED:
      case CEED_RESTRICTION_CURL_ORIENTED:
        break;  // TODO: Not implemented
                // LCOV_EXCL_STOP
    }
  }
  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
  return CEED_ERROR_SUCCESS;
}
529 
530 //------------------------------------------------------------------------------
531 // Basis
532 //------------------------------------------------------------------------------
533 static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i, CeedOperatorField op_field,
534                                                 CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor,
535                                                 bool is_at_points, bool use_3d_slices) {
536   bool      is_tensor = true, is_collocated = true;
537   CeedBasis basis;
538   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
539   CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
540   CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
541 
542   std::string         var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
543   std::string         P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
544   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
545   CeedInt             dim = max_dim, elem_size = 0, num_comp = 0, P_1d = 0;
546   CeedElemRestriction elem_rstr;
547 
548   // Get field data
549   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
550   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
551     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
552     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
553   }
554   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
555   if (basis != CEED_BASIS_NONE) {
556     CeedCallBackend(CeedBasisGetDimension(basis, &dim));
557     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
558     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
559   }
560   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
561 
562   // Basis
563   code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
564   if (is_input) {
565     switch (eval_mode) {
566       case CEED_EVAL_NONE:
567         if (!use_3d_slices && !is_at_points) {
568           code << tab << "CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n";
569         }
570         break;
571       case CEED_EVAL_INTERP:
572         if (is_at_points) {
573           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
574 
575           code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
576           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
577                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
578         } else {
579           std::string function_name = is_tensor ? ((dim == 1 ? "Interp" : "InterpTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
580                                                    std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
581                                                 : "InterpNonTensor";
582           std::string op_t_1d_name  = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
583 
584           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
585           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
586                << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
587         }
588         break;
589       case CEED_EVAL_GRAD:
590         if (is_at_points) {
591           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
592 
593           code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
594           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
595                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
596         } else if (use_3d_slices) {
597           std::string function_name =
598               (dim > 1 ? "InterpTensor" : "Interp") + std::string(is_collocated ? "CollocatedNodes" : "") + std::to_string(dim) + "d";
599 
600           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
601           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
602                << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
603         } else if (is_tensor) {
604           bool        is_collocated_grad = dim == 3 && Q_1d >= P_1d;
605           std::string function_name =
606               (dim == 1 ? "Grad" : ("GradTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) +
607               std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
608           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
609 
610           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
611                << (is_all_tensor && dim >= 3 ? Q_name : "1") << "];\n";
612           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
613                << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
614         } else {
615           std::string function_name = "GradNonTensor";
616 
617           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
618           code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
619                << ", OP_T_1D>(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
620         }
621         break;
622       case CEED_EVAL_WEIGHT: {
623         if (is_at_points) {
624           code << tab << "// Nothing to do AtPoints\n";
625         } else {
626           CeedBasis_Hip_shared *basis_data;
627           std::string           function_name = is_tensor
628                                                     ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
629                                                     : "WeightNonTensor";
630 
631           code << tab << "CeedScalar r_q" << var_suffix << "[" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
632           CeedCallBackend(CeedBasisGetData(basis, &basis_data));
633           data->W = basis_data->d_q_weight_1d;
634           code << tab << function_name << "<" << P_name << ", " << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
635         }
636         break;
637       }
638       // LCOV_EXCL_START
639       case CEED_EVAL_DIV:
640       case CEED_EVAL_CURL:
641         break;  // TODO: Not implemented
642                 // LCOV_EXCL_STOP
643     }
644   } else {
645     switch (eval_mode) {
646       case CEED_EVAL_NONE:
647         code << tab << "CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
648         break;  // No action
649       case CEED_EVAL_INTERP:
650         code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
651         if (is_at_points) {
652           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
653 
654           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
655                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
656         } else {
657           std::string function_name =
658               is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
659                            std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
660                         : "InterpTransposeNonTensor";
661           std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
662 
663           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
664                << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
665         }
666         break;
667       case CEED_EVAL_GRAD:
668         code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
669         if (is_at_points) {
670           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
671 
672           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
673                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
674         } else if (use_3d_slices) {
675           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
676                                       std::to_string(dim) + "d";
677 
678           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
679                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
680         } else if (is_tensor) {
681           bool        is_collocated_grad = dim == 3 && Q_1d >= P_1d;
682           std::string function_name =
683               (dim == 1 ? "GradTranspose"
684                         : ("GradTransposeTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) +
685               std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
686           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
687 
688           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
689                << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
690         } else {
691           std::string function_name = "GradTransposeNonTensor";
692 
693           code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
694                << ", OP_T_1D>(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
695         }
696         break;
697       // LCOV_EXCL_START
698       case CEED_EVAL_WEIGHT:
699         break;  // Should not occur
700       case CEED_EVAL_DIV:
701       case CEED_EVAL_CURL:
702         break;  // TODO: Not implemented
703                 // LCOV_EXCL_STOP
704     }
705   }
706   CeedCallBackend(CeedBasisDestroy(&basis));
707   return CEED_ERROR_SUCCESS;
708 }
709 
710 //------------------------------------------------------------------------------
711 // QFunction
712 //------------------------------------------------------------------------------
// Emit the QFunction-application stage of the generated operator kernel into `code`.
//
// The emitted source (1) declares the per-thread output registers (or accumulators
// for the AtPoints / 3D-slice layouts), (2) gathers input values into the
// QFunction-ready registers `r_s_*`, (3) calls the user QFunction, and (4) writes
// or accumulates the results back.  Three layouts are generated:
//   - is_at_points:  loop over batches of arbitrary points per element
//   - use_3d_slices: loop over planes of quadrature points in 3D elements
//   - otherwise:     full elements; registers already hold the quadrature data
//
// Code is byte-appended to `code` at the indentation tracked by `tab`; `data` is
// updated with device pointers (e.g. restriction offsets) that the emitted code
// reads at launch time.  Returns CEED_ERROR_SUCCESS or a backend error code.
static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt max_dim,
                                                    CeedInt max_num_points, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
                                                    CeedQFunctionField *qf_input_fields, CeedInt num_output_fields,
                                                    CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields,
                                                    std::string qfunction_name, CeedInt Q_1d, bool is_all_tensor, bool is_at_points,
                                                    bool use_3d_slices) {
  // Name of the quadrature-size constant used in the emitted code
  std::string         Q_name    = is_all_tensor ? "Q_1d" : "Q";
  CeedEvalMode        eval_mode = CEED_EVAL_NONE;
  CeedElemRestriction elem_rstr;

  // Setup output arrays: declare the registers (or zero-initialized accumulators)
  // that each output field's basis action and the QFunction will write into
  code << "\n";
  code << tab << "// -- Output field setup\n";
  for (CeedInt i = 0; i < num_output_fields; i++) {
    const char *field_name;
    std::string var_suffix = "_out_" + std::to_string(i);

    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
    code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
    switch (eval_mode) {
      case CEED_EVAL_NONE:
        if (is_at_points) {
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n";
        } else {
          // Only fully-tensor 3D layouts keep all Q_1d quadrature values per thread
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
               << "];\n";
        }
        break;
      case CEED_EVAL_INTERP:
        if (is_at_points) {
          // Accumulator for point data; zeroed here and filled by the transpose
          // basis action emitted after the QFunction call below
          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
          code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
               << "[i] = 0.0;\n";
        } else {
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
               << "];\n";
        }
        break;
      case CEED_EVAL_GRAD:
        if (is_at_points) {
          // Accumulator for point data
          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
          code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
               << "[i] = 0.0;\n";
        } else if (use_3d_slices) {
          // Accumulator for gradient slices
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
          code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) r_q" << var_suffix << "[i] = 0.0;\n";
        } else {
          // Full gradient: one entry per component per dimension
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
               << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") << "];\n";
        }
        break;
      case CEED_EVAL_WEIGHT:
        break;  // Should not occur for an output field
        // LCOV_EXCL_START
      case CEED_EVAL_DIV:
      case CEED_EVAL_CURL:
        break;  // TODO: Not implemented
                // LCOV_EXCL_STOP
    }
  }

  if (is_at_points) {
    // We need to handle batches of points
    code << "\n";
    code << tab << "// Note: Using batches of points\n";
    // Loop bound is rounded up to a multiple of blockDim.x*blockDim.y so every
    // thread executes the same number of iterations; iterations past the real
    // point count are masked out where outputs are written (see below)
    code << tab << "const CeedInt point_loop_bound = (blockDim.x*blockDim.y) * ceil((1.0*max_num_points) / (blockDim.x*blockDim.y));\n\n";
    code << tab << "#pragma unroll\n";
    code << tab << "for (CeedInt i = threadIdx.x + threadIdx.y*blockDim.x; i < point_loop_bound; i += blockDim.x*blockDim.y) {\n";
    tab.push();
    code << tab << "const CeedInt p = i % max_num_points;\n\n";

    code << tab << "// -- Coordinates\n";
    code << tab << "CeedScalar r_x[max_dim];\n";
    code << tab << "ReadPoint<max_dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n";

    code << tab << "// -- Input fields\n";
    for (CeedInt i = 0; i < num_input_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_in_" + std::to_string(i);
      std::string P_name     = "P_1d" + var_suffix;

      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
      code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
      // Basis action: evaluate this input's coefficients at the current point
      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          // Raw point data read directly from the L-vector via restriction indices
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          code << tab << "ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
               << ", max_num_points>(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
          break;
        case CEED_EVAL_INTERP:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          code << tab << "InterpAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
               << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
          break;
        case CEED_EVAL_GRAD:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          code << tab << "GradAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
               << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
          break;
        case CEED_EVAL_WEIGHT:
          // Weight at arbitrary points is emitted as the constant 1.0
          code << tab << "CeedScalar r_s" << var_suffix << "[1];\n";
          code << tab << "r_s" << var_suffix << "[0] = 1.0;\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
    code << "\n";
    code << tab << "// -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_out_" + std::to_string(i);

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      // Basis action: only declare the QFunction output registers here; the
      // transpose basis action is emitted after the QFunction call
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          break;
        case CEED_EVAL_INTERP:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          break;
        case CEED_EVAL_GRAD:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_WEIGHT:
          break;  // Should not occur
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }

  } else if (use_3d_slices) {
    // We treat quadrature points per slice in 3d to save registers
    code << "\n";
    code << tab << "// Note: Using planes of 3D elements\n";
    code << tab << "#pragma unroll\n";
    code << tab << "for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
    tab.push();
    code << tab << "// -- Input fields\n";
    for (CeedInt i = 0; i < num_input_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_in_" + std::to_string(i);

      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
      code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
      // Basis action: load this input's values for the current slice q
      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          bool is_strided;

          // EVAL_NONE inputs are read straight from device memory per slice; the
          // read pattern depends on whether the restriction is strided or indexed
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";

          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
          CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
          if (is_strided) {
            bool    has_backend_strides;
            CeedInt num_elem, elem_size;

            CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
            CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
            CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
            // Default backend strides; replaced below when user strides were given
            CeedInt strides[3] = {1, elem_size * num_elem, elem_size};

            if (!has_backend_strides) {
              CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
            }
            // Strides are baked into the generated source as compile-time constants
            code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
                 << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
            code << tab << "ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", strides" << var_suffix << "_0, strides"
                 << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n";
          } else {
            CeedSize                 l_size = 0;
            CeedInt                  comp_stride;
            CeedElemRestriction_Hip *rstr_data;

            CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
            code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
            CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
            code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
            CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
            // Record the device offsets; the emitted code reads them through
            // indices.inputs[i] at kernel launch time
            data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
            code << tab << "ReadEVecSliceStandard3d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", " << Q_name << ">(data, l_size"
                 << var_suffix << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
          }
          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
          break;
        case CEED_EVAL_INTERP:
          // Copy this slice's values out of the full quadrature registers
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n";
          tab.push();
          code << tab << "r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n";
          tab.pop();
          code << tab << "}\n";
          break;
        case CEED_EVAL_GRAD:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          code << tab << "GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_q" << var_suffix << ", s_G"
               << var_suffix << ", r_s" << var_suffix << ");\n";
          break;
        case CEED_EVAL_WEIGHT:
          code << tab << "CeedScalar r_s" << var_suffix << "[1];\n";
          code << tab << "r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
    code << "\n";
    code << tab << "// -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_out_" + std::to_string(i);

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      // Basis action: declare per-slice output registers for the QFunction
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          break;
        case CEED_EVAL_INTERP:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          break;
        case CEED_EVAL_GRAD:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_WEIGHT:
          break;  // Should not occur
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
  } else {
    // Full elements: the quadrature registers are used by the QFunction directly,
    // so the QFunction pointers just alias the existing r_q_* registers
    code << "\n";
    code << tab << "// Note: Using full elements\n";
    code << tab << "{\n";
    tab.push();
    code << tab << "// -- Input fields\n";
    for (CeedInt i = 0; i < num_input_fields; i++) {
      const char *field_name;

      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
      code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
      code << tab << "CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n";
    }
    code << tab << "// -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      code << tab << "CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n";
    }
  }

  // Input and output buffers: pointer arrays passed to the user QFunction
  // (sized at least 1 so the declarations stay valid with zero fields)
  code << "\n";
  code << tab << "// -- QFunction inputs and outputs\n";
  code << tab << "// ---- Inputs\n";
  code << tab << "CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
  for (CeedInt i = 0; i < num_input_fields; i++) {
    const char *field_name;

    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
    code << tab << "// ------ Input field " << i << ": " << field_name << "\n";
    code << tab << "inputs[" << i << "] = r_s_in_" << i << ";\n";
  }
  code << tab << "// ---- Outputs\n";
  code << tab << "CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
  for (CeedInt i = 0; i < num_output_fields; i++) {
    const char *field_name;

    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
    code << tab << "// ------ Output field " << i << ": " << field_name << "\n";
    code << tab << "outputs[" << i << "] = r_s_out_" << i << ";\n";
  }

  // Apply QFunction.  Only the fully-tensor 3D layout without slicing holds all
  // Q_1d quadrature values per thread (see the register sizing above), so only
  // that case calls the QFunction with Q_1d points; all other layouts pass 1.
  code << "\n";
  code << tab << "// -- Apply QFunction\n";
  code << tab << "" << qfunction_name << "(ctx, ";
  if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
    code << "1";
  } else {
    code << Q_name;
  }
  code << ", inputs, outputs);\n";

  if (is_at_points) {
    // Map back to coefficients
    code << "\n";
    code << tab << "// -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_out_" + std::to_string(i);
      std::string P_name     = "P_1d" + var_suffix;

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      // Basis action: transpose evaluation accumulating into the r_c coefficients
      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
      switch (eval_mode) {
        case CEED_EVAL_NONE: {
          CeedInt             comp_stride;
          CeedElemRestriction elem_rstr;

          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
          code << tab << "WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
               << ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]"
               << ", r_s" << var_suffix << ", d" << var_suffix << ");\n";
          break;
        }
        case CEED_EVAL_INTERP:
          // Zero contributions from padding iterations past the element's actual
          // point count before accumulating into the coefficients
          code << tab << "if (i >= points.num_per_elem[elem]) {\n";
          tab.push();
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
          tab.pop();
          code << tab << "}\n";
          code << tab << "InterpTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
               << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
          break;
        case CEED_EVAL_GRAD:
          // Same padding guard as the INTERP case above
          code << tab << "if (i >= points.num_per_elem[elem]) {\n";
          tab.push();
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
          tab.pop();
          code << tab << "}\n";
          code << tab << "GradTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
               << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_WEIGHT:
          break;  // Should not occur
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
  } else if (use_3d_slices) {
    // Copy or apply transpose grad, if needed
    code << "\n";
    code << tab << "// -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_out_" + std::to_string(i);
      std::string P_name     = "P_1d" + var_suffix;

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      // Basis action: move this slice's QFunction results back into the full
      // quadrature registers (or apply the transpose collocated gradient)
      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
          tab.push();
          code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
          tab.pop();
          code << tab << "}\n";
          break;
        case CEED_EVAL_INTERP:
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
          tab.push();
          code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
          tab.pop();
          code << tab << "}\n";
          break;
        case CEED_EVAL_GRAD:
          code << tab << "GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_s" << var_suffix << ", s_G"
               << var_suffix << ", r_q" << var_suffix << ");\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_WEIGHT:
          break;  // Should not occur
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
  }
  // Close the point-batch loop, slice loop, or full-element scope opened above
  tab.pop();
  code << tab << "}\n";
  return CEED_ERROR_SUCCESS;
}
1128 
1129 //------------------------------------------------------------------------------
1130 // Build single operator kernel
1131 //------------------------------------------------------------------------------
1132 extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_build) {
1133   bool                   is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
1134   Ceed                   ceed;
1135   CeedInt                Q = 0, Q_1d = 0, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
1136   CeedQFunctionField    *qf_input_fields, *qf_output_fields;
1137   CeedQFunction_Hip_gen *qf_data;
1138   CeedQFunction          qf;
1139   CeedOperatorField     *op_input_fields, *op_output_fields;
1140   CeedOperator_Hip_gen  *data;
1141   std::ostringstream     code;
1142   Tab                    tab;
1143 
1144   CeedCallBackend(CeedOperatorGetData(op, &data));
1145   {
1146     bool is_setup_done;
1147 
1148     CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
1149     if (is_setup_done) {
1150       *is_good_build = !data->use_fallback;
1151       return CEED_ERROR_SUCCESS;
1152     }
1153   }
1154 
1155   // Check field compatibility
1156   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
1157   {
1158     bool has_shared_bases = true;
1159 
1160     for (CeedInt i = 0; i < num_input_fields; i++) {
1161       CeedBasis basis;
1162 
1163       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
1164       if (basis != CEED_BASIS_NONE) {
1165         bool        is_tensor = true;
1166         const char *resource;
1167         char       *resource_root;
1168         Ceed        basis_ceed;
1169 
1170         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
1171         is_all_tensor    = is_all_tensor && is_tensor;
1172         is_all_nontensor = is_all_nontensor && !is_tensor;
1173         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
1174         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
1175         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
1176         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
1177         CeedCallBackend(CeedFree(&resource_root));
1178         CeedCallBackend(CeedDestroy(&basis_ceed));
1179       }
1180       CeedCallBackend(CeedBasisDestroy(&basis));
1181     }
1182 
1183     for (CeedInt i = 0; i < num_output_fields; i++) {
1184       CeedBasis basis;
1185 
1186       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
1187       if (basis != CEED_BASIS_NONE) {
1188         bool        is_tensor = true;
1189         const char *resource;
1190         char       *resource_root;
1191         Ceed        basis_ceed;
1192 
1193         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
1194         is_all_tensor    = is_all_tensor && is_tensor;
1195         is_all_nontensor = is_all_nontensor && !is_tensor;
1196 
1197         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
1198         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
1199         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
1200         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
1201         CeedCallBackend(CeedFree(&resource_root));
1202         CeedCallBackend(CeedDestroy(&basis_ceed));
1203       }
1204       CeedCallBackend(CeedBasisDestroy(&basis));
1205     }
1206     // -- Fallback to ref if not all bases are shared
1207     if (!has_shared_bases) {
1208       *is_good_build = false;
1209       return CEED_ERROR_SUCCESS;
1210     }
1211   }
1212   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1213   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
1214   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
1215   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
1216 
1217   // Get operator data
1218   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
1219   {
1220     CeedInt max_P = 0, max_P_1d = 0;
1221 
1222     CeedCallBackend(CeedOperatorBuildKernelData_Hip_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields,
1223                                                         qf_output_fields, &max_P, &max_P_1d, &Q, &Q_1d, &max_dim, &is_all_tensor, &use_3d_slices));
1224     data->max_P_1d = is_all_tensor ? max_P_1d : max_P;
1225   }
1226   if (max_dim == 0) max_dim = 1;
1227   data->dim = max_dim;
1228   if (is_at_points) {
1229     CeedElemRestriction_Hip *rstr_data;
1230     CeedElemRestriction      rstr_points = NULL;
1231 
1232     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
1233     CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
1234     CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
1235     CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data));
1236     data->points.indices = (CeedInt *)rstr_data->d_offsets;
1237     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
1238   }
1239   if (is_at_points) use_3d_slices = false;
1240   if (Q_1d == 0) {
1241     if (is_at_points) Q_1d = max_num_points;
1242     else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d));
1243   }
1244   if (Q == 0) Q = Q_1d;
1245   data->Q    = Q;
1246   data->Q_1d = Q_1d;
1247 
1248   // Check for restriction only identity operator
1249   {
1250     bool is_identity_qf;
1251 
1252     CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf));
1253     if (is_identity_qf) {
1254       CeedEvalMode eval_mode_in, eval_mode_out;
1255 
1256       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in));
1257       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out));
1258       CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND,
1259                 "Backend does not implement restriction only identity operators");
1260     }
1261   }
1262 
1263   // Load basis source files
1264   if (!is_all_nontensor) {
1265     code << tab << "// Tensor basis source\n";
1266     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
1267   }
1268   if (!is_all_tensor) {
1269     code << tab << "// Non-tensor basis source\n";
1270     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n";
1271   }
1272   if (is_at_points) {
1273     code << tab << "// AtPoints basis source\n";
1274     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n";
1275   }
1276   if (!is_all_tensor && !is_all_nontensor) {
1277     code << tab << "// Tensor basis source\n";
1278     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h>\n\n";
1279   }
1280   code << tab << "// CodeGen operator source\n";
1281   code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
1282 
1283   // Get QFunction name
1284   std::string qfunction_name(qf_data->qfunction_name);
1285   std::string operator_name;
1286 
1287   operator_name = "CeedKernelHipGenOperator_" + qfunction_name;
1288 
1289   // Define CEED_Q_VLA
1290   code << "\n" << tab << "#undef CEED_Q_VLA\n";
1291   if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
1292     code << tab << "#define CEED_Q_VLA 1\n\n";
1293   } else {
1294     code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n";
1295   }
1296 
1297   // Add user QFunction source
1298   {
1299     const char *source_path;
1300 
1301     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
1302     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
1303 
1304     code << tab << "// User QFunction source\n";
1305     code << tab << "#include \"" << source_path << "\"\n\n";
1306   }
1307 
1308   // Setup
1309   code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
1310   code << tab << "// Operator Kernel\n";
1311   code << tab << "// \n";
1312   code << tab << "// d_[in,out]_i:   CeedVector device array\n";
1313   code << tab << "// r_[in,out]_e_i: Element vector register\n";
1314   code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
1315   code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
1316   code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
1317   code << tab << "// \n";
1318   code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
1319   code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
1320   code << tab << "// -----------------------------------------------------------------------------\n";
1321   code << tab << "extern \"C\" __launch_bounds__(BLOCK_SIZE)\n";
1322   code << "__global__ void " << operator_name
1323        << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar* W, Points_Hip points) {\n";
1324   tab.push();
1325 
1326   // Scratch buffers
1327   for (CeedInt i = 0; i < num_input_fields; i++) {
1328     CeedEvalMode eval_mode;
1329 
1330     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1331     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
1332       code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
1333     }
1334   }
1335   for (CeedInt i = 0; i < num_output_fields; i++) {
1336     code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
1337   }
1338 
1339   code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
1340   if (!is_all_tensor) {
1341     code << tab << "const CeedInt Q = " << Q << ";\n";
1342   }
1343   if (!is_all_nontensor) {
1344     code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
1345   }
1346   if (is_at_points) {
1347     code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n";
1348     code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
1349   }
1350 
1351   // Shared data
1352   code << tab << "extern __shared__ CeedScalar slice[];\n";
1353   code << tab << "SharedData_Hip data;\n";
1354   code << tab << "data.t_id_x = threadIdx.x;\n";
1355   code << tab << "data.t_id_y = threadIdx.y;\n";
1356   code << tab << "data.t_id_z = threadIdx.z;\n";
1357   code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
1358   code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
1359 
1360   // -- Determine input mat reuse
1361   FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
1362 
1363   for (CeedInt i = 0; i < num_input_fields; i++) {
1364     input_matrix_reuse[i].index = -1;
1365   }
1366   for (CeedInt i = 0; i < num_input_fields; i++) {
1367     bool         is_tensor = true;
1368     CeedEvalMode eval_mode_i;
1369     CeedBasis    basis_i;
1370 
1371     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
1372     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
1373     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
1374     CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
1375     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
1376       CeedEvalMode eval_mode_j;
1377       CeedBasis    basis_j;
1378 
1379       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1380       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1381       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1382       if (basis_i == basis_j) {
1383         if (is_tensor) {
1384           input_matrix_reuse[i].index     = j;
1385           input_matrix_reuse[i].is_input  = true;
1386           input_matrix_reuse[i].eval_mode = eval_mode_j;
1387         } else {
1388           // For non-tensor can only re-use with the same eval mode
1389           if (eval_mode_i == eval_mode_j) {
1390             input_matrix_reuse[i].index     = j;
1391             input_matrix_reuse[i].is_input  = true;
1392             input_matrix_reuse[i].eval_mode = eval_mode_j;
1393           }
1394         }
1395       }
1396       CeedCallBackend(CeedBasisDestroy(&basis_j));
1397     }
1398     CeedCallBackend(CeedBasisDestroy(&basis_i));
1399   }
1400 
1401   // -- Determine output mat reuse
1402   FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
1403 
1404   for (CeedInt i = 0; i < num_output_fields; i++) {
1405     output_matrix_reuse[i].index = -1;
1406   }
1407   for (CeedInt i = 0; i < num_output_fields; i++) {
1408     bool         is_tensor = true;
1409     CeedEvalMode eval_mode_i;
1410     CeedBasis    basis_i;
1411 
1412     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
1413     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
1414     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
1415       CeedEvalMode eval_mode_j;
1416       CeedBasis    basis_j;
1417 
1418       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1419       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1420       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1421       if (basis_i == basis_j) {
1422         if (is_tensor) {
1423           output_matrix_reuse[i].index     = j;
1424           output_matrix_reuse[i].is_input  = true;
1425           output_matrix_reuse[i].eval_mode = eval_mode_j;
1426         } else {
1427           // For non-tensor can only re-use with the same eval mode
1428           if (eval_mode_i == eval_mode_j) {
1429             output_matrix_reuse[i].index     = j;
1430             output_matrix_reuse[i].is_input  = true;
1431             output_matrix_reuse[i].eval_mode = eval_mode_j;
1432           }
1433         }
1434       }
1435       CeedCallBackend(CeedBasisDestroy(&basis_j));
1436     }
1437     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
1438       CeedEvalMode eval_mode_j;
1439       CeedBasis    basis_j;
1440 
1441       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
1442       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1443       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
1444       CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
1445       if (basis_i == basis_j) {
1446         if (is_tensor) {
1447           output_matrix_reuse[i].index     = j;
1448           output_matrix_reuse[i].is_input  = false;
1449           output_matrix_reuse[i].eval_mode = eval_mode_j;
1450         } else {
1451           // For non-tensor can only re-use with the same eval mode
1452           if (eval_mode_i == eval_mode_j) {
1453             output_matrix_reuse[i].index     = j;
1454             output_matrix_reuse[i].is_input  = false;
1455             output_matrix_reuse[i].eval_mode = eval_mode_j;
1456           }
1457         }
1458       }
1459       CeedCallBackend(CeedBasisDestroy(&basis_j));
1460     }
1461     CeedCallBackend(CeedBasisDestroy(&basis_i));
1462   }
1463 
1464   // Initialize constants, and matrices B and G
1465   code << "\n" << tab << "// Input field constants and basis data\n";
1466   for (CeedInt i = 0; i < num_input_fields; i++) {
1467     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
1468                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1469   }
1470   code << "\n" << tab << "// Output field constants and basis data\n";
1471   for (CeedInt i = 0; i < num_output_fields; i++) {
1472     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
1473                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
1474   }
1475 
1476   // Loop over all elements
1477   code << "\n" << tab << "// Element loop\n";
1478   code << tab << "__syncthreads();\n";
1479   code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
1480   tab.push();
1481 
1482   // -- Compute minimum buffer space needed
1483   CeedInt max_rstr_buffer_size = 1;
1484 
1485   for (CeedInt i = 0; i < num_input_fields; i++) {
1486     CeedEvalMode eval_mode;
1487 
1488     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1489     if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
1490       CeedInt             num_comp;
1491       CeedElemRestriction elem_rstr;
1492 
1493       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
1494       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1495       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1496       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1497     }
1498   }
1499   for (CeedInt i = 0; i < num_output_fields; i++) {
1500     CeedEvalMode eval_mode;
1501 
1502     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
1503     if (eval_mode != CEED_EVAL_NONE) {
1504       CeedInt             num_comp;
1505       CeedElemRestriction elem_rstr;
1506 
1507       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
1508       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1509       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1510       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1511     }
1512   }
1513   code << tab << "// Scratch restriction buffer space\n";
1514   code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
1515 
1516   // -- Determine best input field processing order
1517   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
1518 
1519   for (CeedInt i = 0; i < num_input_fields; i++) {
1520     field_rstr_in_buffer[i] = -1;
1521     input_field_order[i]    = -1;
1522   }
1523   {
1524     bool    is_ordered[CEED_FIELD_MAX];
1525     CeedInt curr_index = 0;
1526 
1527     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
1528     for (CeedInt i = 0; i < num_input_fields; i++) {
1529       CeedVector          vec_i;
1530       CeedElemRestriction rstr_i;
1531 
1532       if (is_ordered[i]) continue;
1533       field_rstr_in_buffer[i]       = i;
1534       is_ordered[i]                 = true;
1535       input_field_order[curr_index] = i;
1536       curr_index++;
1537       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
1538       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
1539       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
1540       for (CeedInt j = i + 1; j < num_input_fields; j++) {
1541         CeedVector          vec_j;
1542         CeedElemRestriction rstr_j;
1543 
1544         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
1545         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
1546         if (rstr_i == rstr_j && vec_i == vec_j) {
1547           field_rstr_in_buffer[j]       = i;
1548           is_ordered[j]                 = true;
1549           input_field_order[curr_index] = j;
1550           curr_index++;
1551         }
1552         CeedCallBackend(CeedVectorDestroy(&vec_j));
1553         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
1554       }
1555       CeedCallBackend(CeedVectorDestroy(&vec_i));
1556       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
1557     }
1558   }
1559 
1560   // -- Input restriction and basis
1561   code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
1562   for (CeedInt i = 0; i < num_input_fields; i++) {
1563     const char   *field_name;
1564     const CeedInt f = input_field_order[i];
1565 
1566     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
1567     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
1568 
1569     // ---- Restriction
1570     CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
1571                                                                max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1572 
1573     // ---- Basis action
1574     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
1575                                                          is_all_tensor, is_at_points, use_3d_slices));
1576   }
1577 
1578   // -- Q function
1579   CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
1580                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
1581                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices));
1582 
1583   // -- Output basis and restriction
1584   code << "\n" << tab << "// -- Output field basis action and restrictions\n";
1585   for (CeedInt i = 0; i < num_output_fields; i++) {
1586     const char *field_name;
1587 
1588     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
1589     code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
1590 
1591     // ---- Basis action
1592     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
1593                                                          is_all_tensor, is_at_points, use_3d_slices));
1594 
1595     // ---- Restriction
1596     CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, i, NULL, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d,
1597                                                                false, is_all_tensor, is_at_points, use_3d_slices));
1598   }
1599 
1600   // Close loop and function
1601   tab.pop();
1602   code << tab << "}\n";
1603   tab.pop();
1604   code << tab << "}\n";
1605   code << tab << "// -----------------------------------------------------------------------------\n\n";
1606 
1607   CeedInt block_sizes[3] = {0, 0, 0};
1608   CeedInt num_elem;
1609 
1610   // Compile
1611   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
1612   CeedCallBackend(BlockGridCalculate_Hip_gen(is_all_tensor ? max_dim : 1, num_elem, data->max_P_1d, is_all_tensor ? Q_1d : Q, block_sizes));
1613   {
1614     bool is_compile_good = false;
1615 
1616     data->thread_1d = block_sizes[0];
1617     CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "OP_T_1D", block_sizes[0], "BLOCK_SIZE",
1618                                        block_sizes[0] * block_sizes[1] * block_sizes[2]));
1619     if (is_compile_good) {
1620       *is_good_build = true;
1621       CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, operator_name.c_str(), &data->op));
1622     } else {
1623       *is_good_build     = false;
1624       data->use_fallback = true;
1625     }
1626   }
1627   CeedCallBackend(CeedOperatorSetSetupDone(op));
1628   CeedCallBackend(CeedDestroy(&ceed));
1629   CeedCallBackend(CeedQFunctionDestroy(&qf));
1630   return CEED_ERROR_SUCCESS;
1631 }
1632 
1633 //------------------------------------------------------------------------------
1634 // Build AtPoints assembly operator kernel
1635 //------------------------------------------------------------------------------
1636 static int CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op, bool is_full, bool *is_good_build) {
1637   bool                   is_all_tensor = true, is_at_points = false, use_3d_slices = false;
1638   Ceed                   ceed;
1639   CeedInt                Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
1640   CeedQFunctionField    *qf_input_fields, *qf_output_fields;
1641   CeedQFunction_Hip_gen *qf_data;
1642   CeedQFunction          qf;
1643   CeedOperatorField     *op_input_fields, *op_output_fields;
1644   CeedOperator_Hip_gen  *data;
1645   std::ostringstream     code;
1646   Tab                    tab;
1647 
1648   // Check compatibility
1649   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1650   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
1651   CeedCheck(is_at_points, ceed, CEED_ERROR_BACKEND, "Only AtPoints operator assembly supported");
1652 
1653   // Retrieve operator data
1654   CeedCallBackend(CeedOperatorGetData(op, &data));
1655   Q       = data->Q;
1656   Q_1d    = data->Q_1d;
1657   max_dim = data->dim;
1658   {
1659     CeedElemRestriction rstr_points = NULL;
1660 
1661     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
1662     CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
1663     CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
1664     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
1665   }
1666   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
1667   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
1668   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
1669   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
1670 
1671   // Load basis source files
1672   code << tab << "// Tensor basis source\n";
1673   code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
1674   code << tab << "// AtPoints basis source\n";
1675   code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n";
1676   code << tab << "// CodeGen operator source\n";
1677   code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
1678 
1679   // Get QFunction name
1680   std::string qfunction_name(qf_data->qfunction_name);
1681   std::string operator_name;
1682 
1683   if (is_full) {
1684     operator_name = "CeedKernelHipGenOperatorFullAssembly_" + qfunction_name;
1685   } else {
1686     operator_name = "CeedKernelHipGenOperatorDiagonalAssembly_" + qfunction_name;
1687   }
1688 
1689   // Define CEED_Q_VLA
1690   code << "\n" << tab << "#undef CEED_Q_VLA\n";
1691   code << tab << "#define CEED_Q_VLA 1\n\n";
1692 
1693   // Add user QFunction source
1694   {
1695     const char *source_path;
1696 
1697     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
1698     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
1699 
1700     code << tab << "// User QFunction source\n";
1701     code << tab << "#include \"" << source_path << "\"\n\n";
1702   }
1703 
1704   // Setup
1705   code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
1706   code << tab << "// Operator Assembly Kernel\n";
1707   code << tab << "// \n";
1708   code << tab << "// d_[in,out]_i:   CeedVector device array\n";
1709   code << tab << "// r_[in,out]_e_i: Element vector register\n";
1710   code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
1711   code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
1712   code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
1713   code << tab << "// \n";
1714   code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
1715   code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
1716   code << tab << "// -----------------------------------------------------------------------------\n";
1717   code << tab << "extern \"C\" __global__ void " << operator_name
1718        << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar *W, Points_Hip "
1719           "points, CeedScalar *__restrict__ values_array) {\n";
1720   tab.push();
1721 
1722   // Scratch buffers
1723   for (CeedInt i = 0; i < num_input_fields; i++) {
1724     CeedEvalMode eval_mode;
1725 
1726     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1727     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
1728       code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
1729     }
1730   }
1731   for (CeedInt i = 0; i < num_output_fields; i++) {
1732     code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
1733   }
1734 
1735   code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
1736   code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
1737   code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n";
1738   code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
1739 
1740   // Shared data
1741   code << tab << "extern __shared__ CeedScalar slice[];\n";
1742   code << tab << "SharedData_Hip data;\n";
1743   code << tab << "data.t_id_x = threadIdx.x;\n";
1744   code << tab << "data.t_id_y = threadIdx.y;\n";
1745   code << tab << "data.t_id_z = threadIdx.z;\n";
1746   code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
1747   code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
1748 
1749   // -- Determine input mat reuse
1750   FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
1751 
1752   for (CeedInt i = 0; i < num_input_fields; i++) {
1753     input_matrix_reuse[i].index = -1;
1754   }
1755   for (CeedInt i = 0; i < num_input_fields; i++) {
1756     CeedEvalMode eval_mode_i;
1757     CeedBasis    basis_i;
1758 
1759     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
1760     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
1761     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
1762     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
1763       CeedEvalMode eval_mode_j;
1764       CeedBasis    basis_j;
1765 
1766       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1767       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1768       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1769       if (basis_i == basis_j) {
1770         input_matrix_reuse[i].index     = j;
1771         input_matrix_reuse[i].is_input  = true;
1772         input_matrix_reuse[i].eval_mode = eval_mode_j;
1773       }
1774       CeedCallBackend(CeedBasisDestroy(&basis_j));
1775     }
1776     CeedCallBackend(CeedBasisDestroy(&basis_i));
1777   }
1778 
1779   // -- Determine output mat reuse
1780   FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
1781 
1782   for (CeedInt i = 0; i < num_output_fields; i++) {
1783     output_matrix_reuse[i].index = -1;
1784   }
1785   for (CeedInt i = 0; i < num_output_fields; i++) {
1786     CeedEvalMode eval_mode_i;
1787     CeedBasis    basis_i;
1788 
1789     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
1790     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
1791     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
1792       CeedEvalMode eval_mode_j;
1793       CeedBasis    basis_j;
1794 
1795       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1796       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1797       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1798       if (basis_i == basis_j) {
1799         output_matrix_reuse[i].index     = j;
1800         output_matrix_reuse[i].is_input  = true;
1801         output_matrix_reuse[i].eval_mode = eval_mode_j;
1802       }
1803       CeedCallBackend(CeedBasisDestroy(&basis_j));
1804     }
1805     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
1806       CeedEvalMode eval_mode_j;
1807       CeedBasis    basis_j;
1808 
1809       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
1810       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1811       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
1812       if (basis_i == basis_j) {
1813         output_matrix_reuse[i].index     = j;
1814         output_matrix_reuse[i].is_input  = false;
1815         output_matrix_reuse[i].eval_mode = eval_mode_j;
1816       }
1817       CeedCallBackend(CeedBasisDestroy(&basis_j));
1818     }
1819     CeedCallBackend(CeedBasisDestroy(&basis_i));
1820   }
1821 
1822   // Initialize constants, and matrices B and G
1823   code << "\n" << tab << "// Input field constants and basis data\n";
1824   for (CeedInt i = 0; i < num_input_fields; i++) {
1825     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
1826                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1827   }
1828   code << "\n" << tab << "// Output field constants and basis data\n";
1829   for (CeedInt i = 0; i < num_output_fields; i++) {
1830     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
1831                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
1832   }
1833 
1834   // Loop over all elements
1835   code << "\n" << tab << "// Element loop\n";
1836   code << tab << "__syncthreads();\n";
1837   code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
1838   tab.push();
1839 
1840   // -- Compute minimum buffer space needed
1841   CeedInt max_rstr_buffer_size = 1;
1842 
1843   for (CeedInt i = 0; i < num_input_fields; i++) {
1844     CeedEvalMode eval_mode;
1845 
1846     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1847     if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
1848       CeedInt             num_comp;
1849       CeedElemRestriction elem_rstr;
1850 
1851       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
1852       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1853       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1854       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1855     }
1856   }
1857   for (CeedInt i = 0; i < num_output_fields; i++) {
1858     CeedEvalMode eval_mode;
1859 
1860     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
1861     if (eval_mode != CEED_EVAL_NONE) {
1862       CeedInt             num_comp;
1863       CeedElemRestriction elem_rstr;
1864 
1865       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
1866       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1867       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1868       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1869     }
1870   }
1871   code << tab << "// Scratch restriction buffer space\n";
1872   code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
1873 
1874   // -- Determine best input field processing order
1875   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
1876 
1877   for (CeedInt i = 0; i < num_input_fields; i++) {
1878     field_rstr_in_buffer[i] = -1;
1879     input_field_order[i]    = -1;
1880   }
1881   {
1882     bool    is_ordered[CEED_FIELD_MAX];
1883     CeedInt curr_index = 0;
1884 
1885     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
1886     for (CeedInt i = 0; i < num_input_fields; i++) {
1887       CeedVector          vec_i;
1888       CeedElemRestriction rstr_i;
1889 
1890       if (is_ordered[i]) continue;
1891       field_rstr_in_buffer[i]       = i;
1892       is_ordered[i]                 = true;
1893       input_field_order[curr_index] = i;
1894       curr_index++;
1895       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
1896       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
1897       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
1898       for (CeedInt j = i + 1; j < num_input_fields; j++) {
1899         CeedVector          vec_j;
1900         CeedElemRestriction rstr_j;
1901 
1902         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
1903         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
1904         if (rstr_i == rstr_j && vec_i == vec_j) {
1905           field_rstr_in_buffer[j]       = i;
1906           is_ordered[j]                 = true;
1907           input_field_order[curr_index] = j;
1908           curr_index++;
1909         }
1910         CeedCallBackend(CeedVectorDestroy(&vec_j));
1911         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
1912       }
1913       CeedCallBackend(CeedVectorDestroy(&vec_i));
1914       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
1915     }
1916   }
1917 
1918   // -- Input restriction and basis
1919   code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
1920   CeedInt active_field_index = -1;
1921 
1922   for (CeedInt i = 0; i < num_input_fields; i++) {
1923     bool          is_active = false;
1924     const char   *field_name;
1925     const CeedInt f = input_field_order[i];
1926 
1927     {
1928       CeedVector vec;
1929 
1930       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
1931       is_active = vec == CEED_VECTOR_ACTIVE;
1932       CeedCallBackend(CeedVectorDestroy(&vec));
1933     }
1934 
1935     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
1936     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
1937 
1938     if (is_active) {
1939       std::string var_suffix = "_in_" + std::to_string(f);
1940 
1941       code << tab << "// Active field - no restriction or basis action here\n";
1942       if (active_field_index == -1) {
1943         active_field_index = f;
1944         code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? "P_1d" + var_suffix : "1")
1945              << "] = {0.0};\n";
1946       } else {
1947         code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_in_" << active_field_index << ";\n";
1948       }
1949     } else {
1950       // ---- Restriction
1951       CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
1952                                                                  max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1953 
1954       // ---- Basis action
1955       CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
1956                                                            is_all_tensor, is_at_points, use_3d_slices));
1957     }
1958   }
1959 
1960   // -- Loop over active field
1961   std::string active_var_suffix = "_in_" + std::to_string(active_field_index);
1962 
1963   code << "\n" << tab << "// Loop over nodes in active field\n";
1964   code << tab << "for (CeedInt n = 0; n < num_comp" << active_var_suffix << "*P_1d" << active_var_suffix
1965        << (max_dim > 1 ? "*P_1d" + active_var_suffix : "") << (max_dim > 2 ? "*P_1d" + active_var_suffix : "") << "; n++) {\n";
1966   tab.push();
1967 
1968   // -- Set current active node and component to 1
1969   code << tab << "// Set current active node and component to 1.0\n";
1970   code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 1.0, r_e"
1971        << active_var_suffix << ");\n\n";
1972 
1973   for (CeedInt i = 0; i < num_input_fields; i++) {
1974     bool          is_active = false;
1975     const char   *field_name;
1976     const CeedInt f = input_field_order[i];
1977 
1978     {
1979       CeedVector vec;
1980 
1981       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
1982       is_active = vec == CEED_VECTOR_ACTIVE;
1983       CeedCallBackend(CeedVectorDestroy(&vec));
1984     }
1985     if (!is_active) continue;
1986 
1987     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
1988     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
1989 
1990     // ---- Basis action
1991     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
1992                                                          is_all_tensor, is_at_points, use_3d_slices));
1993   }
1994 
1995   // -- Q function
1996   CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
1997                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
1998                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices));
1999 
2000   // -- Output basis and restriction
2001   code << "\n" << tab << "// -- Output field basis action and restrictions\n";
2002   for (CeedInt i = 0; i < num_output_fields; i++) {
2003     bool        is_active = false;
2004     const char *field_name;
2005 
2006     {
2007       CeedVector vec;
2008 
2009       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
2010       is_active = vec == CEED_VECTOR_ACTIVE;
2011       CeedCallBackend(CeedVectorDestroy(&vec));
2012     }
2013     if (!is_active) continue;
2014 
2015     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
2016     code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
2017 
2018     // ---- Basis action
2019     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
2020                                                          is_all_tensor, is_at_points, use_3d_slices));
2021 
2022     // ---- Restriction
2023     if (is_full) {
2024       std::string         var_suffix = "_out_" + std::to_string(i);
2025       CeedInt             comp_stride;
2026       CeedSize            l_size;
2027       CeedElemRestriction elem_rstr;
2028 
2029       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
2030       CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
2031       code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
2032       CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
2033       code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
2034       code << tab << "WriteLVecStandard" << max_dim << "d_Assembly<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
2035            << ">(data, l_size" << var_suffix << ", elem, n, r_e" << var_suffix << ", values_array);\n";
2036       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
2037     } else {
2038       std::string         var_suffix = "_out_" + std::to_string(i);
2039       CeedInt             comp_stride;
2040       CeedSize            l_size;
2041       CeedElemRestriction elem_rstr;
2042 
2043       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
2044       CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
2045       code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
2046       CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
2047       code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
2048       code << tab << "WriteLVecStandard" << max_dim << "d_Single<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
2049            << ">(data, l_size" << var_suffix << ", elem, n, indices.outputs[" << i << "], r_e" << var_suffix << ", values_array);\n";
2050       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
2051     }
2052   }
2053 
2054   // -- Reset current active node and component
2055   code << "\n" << tab << "// Reset current active node and component to 0.0\n";
2056   code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 0.0, r_e"
2057        << active_var_suffix << ");\n";
2058 
2059   // -- End of loop over active field
2060   tab.pop();
2061   code << tab << "}\n";
2062 
2063   // Close loop and function
2064   tab.pop();
2065   code << tab << "}\n";
2066   tab.pop();
2067   code << tab << "}\n";
2068   code << tab << "// -----------------------------------------------------------------------------\n\n";
2069 
2070   CeedInt block_sizes[3] = {0, 0, 0};
2071   CeedInt num_elem;
2072 
2073   // Compile
2074   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
2075   CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes));
2076   {
2077     bool is_compile_good = false;
2078 
2079     data->thread_1d = block_sizes[0];
2080     CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good,
2081                                        is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 2, "OP_T_1D", block_sizes[0],
2082                                        "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]));
2083     if (is_compile_good) {
2084       *is_good_build = true;
2085       CeedCallBackend(CeedGetKernel_Hip(ceed, is_full ? data->module_assemble_full : data->module_assemble_diagonal, operator_name.c_str(),
2086                                         is_full ? &data->assemble_full : &data->assemble_diagonal));
2087     } else {
2088       *is_good_build              = false;
2089       data->use_assembly_fallback = true;
2090     }
2091   }
2092   CeedCallBackend(CeedDestroy(&ceed));
2093   CeedCallBackend(CeedQFunctionDestroy(&qf));
2094   return CEED_ERROR_SUCCESS;
2095 }
2096 
2097 extern "C" int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) {
2098   return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, false, is_good_build);
2099 }
2100 
2101 extern "C" int CeedOperatorBuildKernelFullAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) {
2102   return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, true, is_good_build);
2103 }
2104 //------------------------------------------------------------------------------
2105 // Build QFunction assembly operator kernel
2106 //------------------------------------------------------------------------------
2107 extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperator op, bool *is_good_build) {
2108   bool                   is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
2109   Ceed                   ceed;
2110   CeedInt                Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0;
2111   CeedQFunctionField    *qf_input_fields, *qf_output_fields;
2112   CeedQFunction_Hip_gen *qf_data;
2113   CeedQFunction          qf;
2114   CeedOperatorField     *op_input_fields, *op_output_fields;
2115   CeedOperator_Hip_gen  *data;
2116   std::ostringstream     code;
2117   Tab                    tab;
2118 
2119   // Check compatibility
2120   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
2121   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
2122   CeedCheck(!is_at_points, ceed, CEED_ERROR_BACKEND, "AtPoints QFunction assembly is not supported");
2123 
2124   // Check field compatibility
2125   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
2126   {
2127     bool has_shared_bases = true;
2128 
2129     for (CeedInt i = 0; i < num_input_fields; i++) {
2130       CeedBasis basis;
2131 
2132       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
2133       if (basis != CEED_BASIS_NONE) {
2134         bool        is_tensor = true;
2135         const char *resource;
2136         char       *resource_root;
2137         Ceed        basis_ceed;
2138 
2139         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
2140         is_all_tensor    = is_all_tensor && is_tensor;
2141         is_all_nontensor = is_all_nontensor && !is_tensor;
2142         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
2143         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
2144         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
2145         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
2146         CeedCallBackend(CeedFree(&resource_root));
2147         CeedCallBackend(CeedDestroy(&basis_ceed));
2148       }
2149       CeedCallBackend(CeedBasisDestroy(&basis));
2150     }
2151 
2152     for (CeedInt i = 0; i < num_output_fields; i++) {
2153       CeedBasis basis;
2154 
2155       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
2156       if (basis != CEED_BASIS_NONE) {
2157         bool        is_tensor = true;
2158         const char *resource;
2159         char       *resource_root;
2160         Ceed        basis_ceed;
2161 
2162         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
2163         is_all_tensor    = is_all_tensor && is_tensor;
2164         is_all_nontensor = is_all_nontensor && !is_tensor;
2165 
2166         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
2167         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
2168         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
2169         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
2170         CeedCallBackend(CeedFree(&resource_root));
2171         CeedCallBackend(CeedDestroy(&basis_ceed));
2172       }
2173       CeedCallBackend(CeedBasisDestroy(&basis));
2174     }
2175   }
2176 
2177   // Retrieve operator data
2178   CeedCallBackend(CeedOperatorGetData(op, &data));
2179   Q       = data->Q;
2180   Q_1d    = data->Q_1d;
2181   max_dim = data->dim;
2182   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
2183   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
2184   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
2185 
2186   // Load basis source files
2187   if (!is_all_nontensor) {
2188     code << tab << "// Tensor basis source\n";
2189     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
2190   }
2191   if (!is_all_tensor) {
2192     code << tab << "// Non-tensor basis source\n";
2193     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n";
2194   }
2195   if (!is_all_tensor && !is_all_nontensor) {
2196     code << "// Tensor basis source\n";
2197     code << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h>\n\n";
2198   }
2199   code << "// CodeGen operator source\n";
2200   code << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
2201 
2202   // Get QFunction name
2203   std::string qfunction_name(qf_data->qfunction_name);
2204   std::string operator_name;
2205 
2206   operator_name = "CeedKernelHipGenQFunctionAssembly_" + qfunction_name;
2207 
2208   // Define CEED_Q_VLA
2209   code << "\n" << tab << "#undef CEED_Q_VLA\n";
2210   if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
2211     code << tab << "#define CEED_Q_VLA 1\n\n";
2212   } else {
2213     code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n";
2214   }
2215 
2216   // Add user QFunction source
2217   {
2218     const char *source_path;
2219 
2220     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
2221     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
2222 
2223     code << tab << "// User QFunction source\n";
2224     code << tab << "#include \"" << source_path << "\"\n\n";
2225   }
2226 
2227   // Setup
2228   code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
2229   code << tab << "// Operator Assembly Kernel\n";
2230   code << tab << "// \n";
2231   code << tab << "// d_[in,out]_i:   CeedVector device array\n";
2232   code << tab << "// r_[in,out]_e_i: Element vector register\n";
2233   code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
2234   code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
2235   code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
2236   code << tab << "// \n";
2237   code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
2238   code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
2239   code << tab << "// -----------------------------------------------------------------------------\n";
2240   code << tab << "extern \"C\" __global__ void " << operator_name
2241        << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar *W, Points_Hip "
2242           "points, CeedScalar *__restrict__ values_array) {\n";
2243   tab.push();
2244 
2245   // Scratch buffers
2246   for (CeedInt i = 0; i < num_input_fields; i++) {
2247     CeedEvalMode eval_mode;
2248 
2249     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
2250     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
2251       code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
2252     }
2253   }
2254   for (CeedInt i = 0; i < num_output_fields; i++) {
2255     code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
2256   }
2257 
2258   code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
2259   if (!is_all_tensor) {
2260     code << tab << "const CeedInt Q = " << Q << ";\n";
2261   }
2262   if (!is_all_nontensor) {
2263     code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
2264   }
2265 
2266   // Shared data
2267   code << tab << "extern __shared__ CeedScalar slice[];\n";
2268   code << tab << "SharedData_Hip data;\n";
2269   code << tab << "data.t_id_x = threadIdx.x;\n";
2270   code << tab << "data.t_id_y = threadIdx.y;\n";
2271   code << tab << "data.t_id_z = threadIdx.z;\n";
2272   code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
2273   code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
2274 
2275   // -- Determine input mat reuse
2276   FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
2277 
2278   for (CeedInt i = 0; i < num_input_fields; i++) {
2279     input_matrix_reuse[i].index = -1;
2280   }
2281   for (CeedInt i = 0; i < num_input_fields; i++) {
2282     bool         is_tensor = true;
2283     CeedEvalMode eval_mode_i;
2284     CeedBasis    basis_i;
2285 
2286     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
2287     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
2288     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
2289     CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
2290     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
2291       CeedEvalMode eval_mode_j;
2292       CeedBasis    basis_j;
2293 
2294       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
2295       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
2296       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
2297       if (basis_i == basis_j) {
2298         if (is_tensor) {
2299           input_matrix_reuse[i].index     = j;
2300           input_matrix_reuse[i].is_input  = true;
2301           input_matrix_reuse[i].eval_mode = eval_mode_j;
2302         } else {
2303           // For non-tensor can only re-use with the same eval mode
2304           if (eval_mode_i == eval_mode_j) {
2305             input_matrix_reuse[i].index     = j;
2306             input_matrix_reuse[i].is_input  = true;
2307             input_matrix_reuse[i].eval_mode = eval_mode_j;
2308           }
2309         }
2310       }
2311       CeedCallBackend(CeedBasisDestroy(&basis_j));
2312     }
2313     CeedCallBackend(CeedBasisDestroy(&basis_i));
2314   }
2315 
2316   // -- Determine output mat reuse
2317   FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
2318 
2319   for (CeedInt i = 0; i < num_output_fields; i++) {
2320     output_matrix_reuse[i].index = -1;
2321   }
2322   for (CeedInt i = 0; i < num_output_fields; i++) {
2323     bool         is_tensor = true;
2324     CeedEvalMode eval_mode_i;
2325     CeedBasis    basis_i;
2326 
2327     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
2328     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
2329     CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
2330     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
2331       CeedEvalMode eval_mode_j;
2332       CeedBasis    basis_j;
2333 
2334       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
2335       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
2336       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
2337       if (basis_i == basis_j) {
2338         if (is_tensor) {
2339           output_matrix_reuse[i].index     = j;
2340           output_matrix_reuse[i].is_input  = true;
2341           output_matrix_reuse[i].eval_mode = eval_mode_j;
2342         } else {
2343           // For non-tensor can only re-use with the same eval mode
2344           if (eval_mode_i == eval_mode_j) {
2345             output_matrix_reuse[i].index     = j;
2346             output_matrix_reuse[i].is_input  = true;
2347             output_matrix_reuse[i].eval_mode = eval_mode_j;
2348           }
2349         }
2350       }
2351       CeedCallBackend(CeedBasisDestroy(&basis_j));
2352     }
2353     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
2354       CeedEvalMode eval_mode_j;
2355       CeedBasis    basis_j;
2356 
2357       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
2358       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
2359       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
2360       if (basis_i == basis_j) {
2361         if (is_tensor) {
2362           output_matrix_reuse[i].index     = j;
2363           output_matrix_reuse[i].is_input  = false;
2364           output_matrix_reuse[i].eval_mode = eval_mode_j;
2365         } else {
2366           // For non-tensor can only re-use with the same eval mode
2367           if (eval_mode_i == eval_mode_j) {
2368             output_matrix_reuse[i].index     = j;
2369             output_matrix_reuse[i].is_input  = false;
2370             output_matrix_reuse[i].eval_mode = eval_mode_j;
2371           }
2372         }
2373       }
2374       CeedCallBackend(CeedBasisDestroy(&basis_j));
2375     }
2376     CeedCallBackend(CeedBasisDestroy(&basis_i));
2377   }
2378 
2379   // Initialize constants, and matrices B and G
2380   code << "\n" << tab << "// Input field constants and basis data\n";
2381   for (CeedInt i = 0; i < num_input_fields; i++) {
2382     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
2383                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
2384   }
2385   code << "\n" << tab << "// Output field constants and basis data\n";
2386   for (CeedInt i = 0; i < num_output_fields; i++) {
2387     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
2388                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
2389   }
2390 
2391   // Loop over all elements
2392   code << "\n" << tab << "// Element loop\n";
2393   code << tab << "__syncthreads();\n";
2394   code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
2395   tab.push();
2396 
2397   // -- Compute minimum buffer space needed
2398   CeedInt max_rstr_buffer_size = 1;
2399 
2400   for (CeedInt i = 0; i < num_input_fields; i++) {
2401     CeedEvalMode eval_mode;
2402 
2403     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
2404     if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
2405       CeedInt             num_comp;
2406       CeedElemRestriction elem_rstr;
2407 
2408       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
2409       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
2410       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
2411       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
2412     }
2413   }
2414   for (CeedInt i = 0; i < num_output_fields; i++) {
2415     CeedEvalMode eval_mode;
2416 
2417     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
2418     if (eval_mode != CEED_EVAL_NONE) {
2419       CeedInt             num_comp;
2420       CeedElemRestriction elem_rstr;
2421 
2422       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
2423       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
2424       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
2425       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
2426     }
2427   }
2428   code << tab << "// Scratch restriction buffer space\n";
2429   code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
2430 
2431   // -- Determine best input field processing order
2432   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
2433 
2434   for (CeedInt i = 0; i < num_input_fields; i++) {
2435     field_rstr_in_buffer[i] = -1;
2436     input_field_order[i]    = -1;
2437   }
2438   {
2439     bool    is_ordered[CEED_FIELD_MAX];
2440     CeedInt curr_index = 0;
2441 
2442     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
2443     for (CeedInt i = 0; i < num_input_fields; i++) {
2444       CeedVector          vec_i;
2445       CeedElemRestriction rstr_i;
2446 
2447       if (is_ordered[i]) continue;
2448       field_rstr_in_buffer[i]       = i;
2449       is_ordered[i]                 = true;
2450       input_field_order[curr_index] = i;
2451       curr_index++;
2452       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
2453       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
2454       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
2455       for (CeedInt j = i + 1; j < num_input_fields; j++) {
2456         CeedVector          vec_j;
2457         CeedElemRestriction rstr_j;
2458 
2459         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
2460         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
2461         if (rstr_i == rstr_j && vec_i == vec_j) {
2462           field_rstr_in_buffer[j]       = i;
2463           is_ordered[j]                 = true;
2464           input_field_order[curr_index] = j;
2465           curr_index++;
2466         }
2467         CeedCallBackend(CeedVectorDestroy(&vec_j));
2468         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
2469       }
2470       CeedCallBackend(CeedVectorDestroy(&vec_i));
2471       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
2472     }
2473   }
2474 
2475   // -- Input restriction and basis
2476   code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
2477   CeedInt num_active_in = 0, num_active_out = 0, qf_assembly_size_out = 0;
2478   CeedInt active_fields_in[CEED_FIELD_MAX], active_fields_out[CEED_FIELD_MAX];
2479 
2480   for (CeedInt i = 0; i < num_input_fields; i++) {
2481     bool          is_active = false;
2482     const char   *field_name;
2483     const CeedInt f = input_field_order[i];
2484 
2485     {
2486       CeedVector vec;
2487 
2488       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
2489       is_active = vec == CEED_VECTOR_ACTIVE;
2490       CeedCallBackend(CeedVectorDestroy(&vec));
2491     }
2492 
2493     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
2494     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
2495 
2496     if (is_active) {
2497       CeedEvalMode eval_mode;
2498       CeedInt      field_size;
2499 
2500       active_fields_in[num_active_in] = f;
2501       num_active_in++;
2502       CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[f], &field_size));
2503       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[f], &eval_mode));
2504       if (eval_mode == CEED_EVAL_GRAD) {
2505         code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << "dim_in_" << f << "*"
2506              << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
2507       } else {
2508         code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
2509       }
2510       code << tab << "const CeedInt field_size_in_" << f << " = " << field_size << ";\n";
2511     } else {
2512       // ---- Restriction
2513       CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
2514                                                                  max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
2515 
2516       // ---- Basis action
2517       CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
2518                                                            is_all_tensor, is_at_points, use_3d_slices));
2519     }
2520   }
2521   code << tab << "const CeedInt field_sizes_in[" << num_active_in << "] = {";
2522   for (CeedInt i = 0; i < num_active_in; i++) {
2523     code << "field_size_in_" << active_fields_in[i] << (i < num_active_in - 1 ? ", " : "");
2524   }
2525   code << "};\n";
2526   code << tab << "CeedScalar * r_q_in[" << num_active_in << "] = {";
2527   for (CeedInt i = 0; i < num_active_in; i++) {
2528     code << "r_q_in_" << active_fields_in[i] << (i < num_active_in - 1 ? ", " : "");
2529   }
2530   code << "};\n";
2531 
2532   for (CeedInt i = 0; i < num_output_fields; i++) {
2533     bool is_active = false;
2534 
2535     {
2536       CeedVector vec;
2537 
2538       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
2539       is_active = vec == CEED_VECTOR_ACTIVE;
2540       CeedCallBackend(CeedVectorDestroy(&vec));
2541     }
2542     if (is_active) {
2543       const char *field_name;
2544       CeedInt     field_size;
2545 
2546       active_fields_out[num_active_out] = i;
2547       num_active_out++;
2548       CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
2549       qf_assembly_size_out += field_size;
2550       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
2551       code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
2552       code << tab << "const CeedInt field_size_out_" << i << " = " << field_size << ";\n";
2553     }
2554   }
2555   code << tab << "const CeedInt field_sizes_out[" << num_active_out << "] = {";
2556   for (CeedInt i = 0; i < num_active_out; i++) {
2557     code << "field_size_out_" << active_fields_out[i] << (i < num_active_out - 1 ? ", " : "");
2558   }
2559   code << "};\n";
2560   code << tab << "const CeedInt total_size_out = " << qf_assembly_size_out << ";\n";
2561 
2562   // -- Loop over active field
2563   code << "\n" << tab << "CeedInt input_offset = 0;\n";
2564   code << tab << "// Loop over active QFunction input fields\n";
2565   code << tab << "const CeedInt num_active_in = " << num_active_in << ";\n";
2566   code << tab << "for (CeedInt a = 0; a < num_active_in; a++) {\n";
2567   tab.push();
2568 
2569   // -- Loop over size of active field
2570   code << "\n" << tab << "// Loop over current active input field size\n";
2571   code << tab << "const CeedInt field_size_in = field_sizes_in[a];\n";
2572   code << tab << "for (CeedInt s = 0; s < field_size_in; s++) {\n";
2573   tab.push();
2574 
2575   // -- Set current active point and component to 1
2576   code << tab << "// Set current active point and component to 1.0\n";
2577   if (is_all_tensor && (max_dim >= 3)) {
2578     code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 1.0;\n";
2579   } else {
2580     code << tab << "r_q_in[a][s] = 1.0;\n";
2581   }
2582 
2583   // -- Q function
2584   CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
2585                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
2586                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices));
2587 
2588   // -- Output basis and restriction
2589   code << "\n" << tab << "// -- Output field basis action and restrictions\n";
2590   CeedScalar offset = 0;
2591 
2592   for (CeedInt i = 0; i < num_output_fields; i++) {
2593     bool        is_active = false;
2594     const char *field_name;
2595 
2596     {
2597       CeedVector vec;
2598 
2599       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
2600       is_active = vec == CEED_VECTOR_ACTIVE;
2601       CeedCallBackend(CeedVectorDestroy(&vec));
2602     }
2603     if (!is_active) continue;
2604 
2605     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
2606     code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
2607 
2608     // ---- Restriction
2609     CeedInt field_size;
2610 
2611     code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d_QFAssembly<total_size_out, field_size_out_" << i << ", "
2612          << (is_all_tensor ? "Q_1d" : "Q") << ">(data, num_elem, elem, input_offset + s, " << offset << ", r_q_out_" << i << ", values_array);\n";
2613     CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
2614     offset += field_size;
2615   }
2616 
2617   // -- Reset current active node and component
2618   code << "\n" << tab << "// Reset current active node and component to 0.0\n";
2619   if (is_all_tensor && (max_dim >= 3)) {
2620     code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 0.0;\n";
2621   } else {
2622     code << tab << "r_q_in[a][s] = 0.0;\n";
2623   }
2624 
2625   // -- End of loop over size of active field
2626   tab.pop();
2627   code << tab << "}\n";
2628   code << tab << "input_offset += field_size_in;\n";
2629 
2630   // -- End of loop over active field
2631   tab.pop();
2632   code << tab << "}\n";
2633 
2634   // Close loop and function
2635   tab.pop();
2636   code << tab << "}\n";
2637   tab.pop();
2638   code << tab << "}\n";
2639   code << tab << "// -----------------------------------------------------------------------------\n\n";
2640 
2641   CeedInt block_sizes[3] = {0, 0, 0};
2642   CeedInt num_elem;
2643 
2644   // Compile
2645   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
2646   CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes));
2647   {
2648     bool is_compile_good = false;
2649 
2650     data->thread_1d = block_sizes[0];
2651     CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 2, "OP_T_1D", block_sizes[0],
2652                                        "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]));
2653     if (is_compile_good) {
2654       *is_good_build = true;
2655       CeedCallBackend(CeedGetKernel_Hip(ceed, data->module_assemble_qfunction, operator_name.c_str(), &data->assemble_qfunction));
2656     } else {
2657       *is_good_build              = false;
2658       data->use_assembly_fallback = true;
2659     }
2660   }
2661   CeedCallBackend(CeedDestroy(&ceed));
2662   CeedCallBackend(CeedQFunctionDestroy(&qf));
2663   return CEED_ERROR_SUCCESS;
2664 }
2665 
2666 //------------------------------------------------------------------------------
2667