xref: /libCEED/backends/hip-gen/ceed-hip-gen-operator-build.cpp (revision 745f16d1ea9fdcf634671fe63363ae0b5c9a02f9)
1 // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #define CEED_DEBUG_COLOR 12
9 
10 #include <ceed.h>
11 #include <ceed/backend.h>
12 #include <ceed/gen-tools.h>
13 #include <ceed/jit-tools.h>
14 
15 #include <iostream>
16 #include <sstream>
17 #include <string>
18 
19 #include "../hip-ref/ceed-hip-ref.h"
20 #include "../hip-shared/ceed-hip-shared.h"
21 #include "../hip/ceed-hip-common.h"
22 #include "../hip/ceed-hip-compile.h"
23 #include "ceed-hip-gen.h"
24 
// Describes an earlier operator field whose shared-memory basis matrices
// (s_B/s_G) can be aliased instead of loading a duplicate copy.
struct FieldReuse_Hip {
  CeedInt      index;      // index of the field to reuse; -1 means no reuse
  bool         is_input;   // true if the reused field is an input, false if an output
  CeedEvalMode eval_mode;  // eval mode of the reused field (G reuse requires CEED_EVAL_GRAD)
};
30 
31 //------------------------------------------------------------------------------
32 // Calculate the block size used for launching the operator kernel
33 //------------------------------------------------------------------------------
34 extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt num_elem, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes) {
35   const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
36   if (dim == 1) {
37     CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
38 
39     elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
40     block_sizes[0]  = thread_1d;
41     block_sizes[1]  = 1;
42     block_sizes[2]  = elems_per_block;
43   } else if (dim == 2) {
44     const CeedInt elems_per_block = thread_1d < 4 ? 16 : 2;
45 
46     block_sizes[0] = thread_1d;
47     block_sizes[1] = thread_1d;
48     block_sizes[2] = elems_per_block;
49   } else if (dim == 3) {
50     const CeedInt elems_per_block = thread_1d < 6 ? 4 : (thread_1d < 8 ? 2 : 1);
51 
52     block_sizes[0] = thread_1d;
53     block_sizes[1] = thread_1d;
54     block_sizes[2] = elems_per_block;
55   }
56   return CEED_ERROR_SUCCESS;
57 }
58 
59 //------------------------------------------------------------------------------
60 // Determine type of operator
61 //------------------------------------------------------------------------------
62 static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
63                                                CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields,
64                                                CeedQFunctionField *qf_output_fields, CeedInt *max_P, CeedInt *max_P_1d, CeedInt *Q, CeedInt *Q_1d,
65                                                CeedInt *max_dim, bool *is_all_tensor, bool *use_3d_slices) {
66   // Check if all are tensor
67   *is_all_tensor = true;
68   for (CeedInt i = 0; i < num_input_fields; i++) {
69     CeedBasis basis;
70 
71     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
72     if (basis != CEED_BASIS_NONE) {
73       bool is_field_tensor;
74 
75       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
76       *is_all_tensor = *is_all_tensor && is_field_tensor;
77     }
78     CeedCallBackend(CeedBasisDestroy(&basis));
79   }
80   for (CeedInt i = 0; i < num_output_fields; i++) {
81     CeedBasis basis;
82 
83     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
84     if (basis != CEED_BASIS_NONE) {
85       bool is_field_tensor;
86 
87       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
88       *is_all_tensor = *is_all_tensor && is_field_tensor;
89     }
90     CeedCallBackend(CeedBasisDestroy(&basis));
91   }
92 
93   // Find max_P, max_P_1d, Q, and Q_1d
94   bool is_all_3d = true;
95 
96   *max_P    = 0;
97   *max_P_1d = 0;
98   *Q        = 0;
99   *Q_1d     = 0;
100   for (CeedInt i = 0; i < num_input_fields; i++) {
101     CeedBasis basis;
102 
103     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
104     if (basis != CEED_BASIS_NONE) {
105       bool    is_field_tensor;
106       CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
107 
108       // Check if 3D
109       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
110       is_all_3d = is_all_3d && (field_dim == 3);
111       *max_dim  = CeedIntMax(*max_dim, field_dim);
112 
113       // Collect P, P_1d, Q, and Q_1d
114       CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
115       *max_P = CeedIntMax(*max_P, field_P);
116       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
117       if (is_field_tensor) {
118         CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
119         *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
120       }
121       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
122       CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
123       *Q = field_Q;
124       if (is_field_tensor) {
125         CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
126         CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
127         *Q_1d = field_Q_1d;
128       }
129     }
130     CeedCallBackend(CeedBasisDestroy(&basis));
131   }
132   for (CeedInt i = 0; i < num_output_fields; i++) {
133     CeedBasis basis;
134 
135     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
136     if (basis != CEED_BASIS_NONE) {
137       bool    is_field_tensor;
138       CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
139 
140       // Check if 3D
141       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
142       is_all_3d = is_all_3d && (field_dim == 3);
143       *max_dim  = CeedIntMax(*max_dim, field_dim);
144 
145       // Collect P, P_1d, Q, and Q_1d
146       CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
147       *max_P = CeedIntMax(*max_P, field_P);
148       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
149       if (is_field_tensor) {
150         CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
151         *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
152       }
153       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
154       CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
155       *Q = field_Q;
156       if (is_field_tensor) {
157         CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
158         CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
159         *Q_1d = field_Q_1d;
160       }
161     }
162     CeedCallBackend(CeedBasisDestroy(&basis));
163   }
164 
165   // Only use 3D collocated gradient parallelization strategy when gradient is computed
166   *use_3d_slices = false;
167   if (is_all_3d && *is_all_tensor) {
168     bool was_grad_found = false;
169 
170     for (CeedInt i = 0; i < num_input_fields; i++) {
171       CeedEvalMode eval_mode;
172 
173       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
174       if (eval_mode == CEED_EVAL_GRAD) {
175         CeedBasis_Hip_shared *basis_data;
176         CeedBasis             basis;
177 
178         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
179         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
180         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
181         was_grad_found = true;
182         CeedCallBackend(CeedBasisDestroy(&basis));
183       }
184     }
185     for (CeedInt i = 0; i < num_output_fields; i++) {
186       CeedEvalMode eval_mode;
187 
188       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
189       if (eval_mode == CEED_EVAL_GRAD) {
190         CeedBasis_Hip_shared *basis_data;
191         CeedBasis             basis;
192 
193         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
194         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
195         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
196         was_grad_found = true;
197         CeedCallBackend(CeedBasisDestroy(&basis));
198       }
199     }
200   }
201   return CEED_ERROR_SUCCESS;
202 }
203 
204 //------------------------------------------------------------------------------
205 // Setup fields
206 //------------------------------------------------------------------------------
// Emit the per-field constants and shared-memory basis-matrix (s_B/s_G) setup code for one
// operator field, and record the matching device pointers in data->B / data->G.
//
// code             - output stream receiving generated kernel source
// data             - backend data; B/G device pointer arrays are filled here
// tab              - indentation helper for generated code
// i                - field index within the input or output list
// field_reuse      - earlier field whose shared-memory matrices may be aliased (index == -1 for none)
// is_at_points     - AtPoints operator; uses Chebyshev interpolation matrices instead
// use_3d_slices    - 3D collocated-gradient strategy is active
// skip_active_load - emit NULL matrix pointers for the active field instead of loading
static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i,
                                                    CeedOperatorField op_field, CeedQFunctionField qf_field, FieldReuse_Hip field_reuse,
                                                    CeedInt max_dim, CeedInt Q, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
                                                    bool use_3d_slices, bool skip_active_load) {
  bool      is_tensor = true, is_active = true;
  CeedBasis basis;

  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
  if (basis != CEED_BASIS_NONE) CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
  {
    // The active field is marked with the CEED_VECTOR_ACTIVE placeholder vector
    CeedVector vec;

    CeedCallBackend(CeedOperatorFieldGetVector(op_field, &vec));
    is_active = vec == CEED_VECTOR_ACTIVE;
    CeedCallBackend(CeedVectorDestroy(&vec));
  }

  const char           *field_name;
  std::string           var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
  std::string           P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
  std::string           option_name = (is_input ? "inputs" : "outputs");
  CeedEvalMode          eval_mode   = CEED_EVAL_NONE;
  CeedInt               elem_size = 0, num_comp = 0, dim = max_dim, P_1d = 0;
  CeedElemRestriction   elem_rstr;
  // NOTE(review): basis_data is only assigned when basis != CEED_BASIS_NONE; the
  // INTERP/GRAD paths below dereference it — presumably those eval modes never occur
  // with CEED_BASIS_NONE, verify against callers
  CeedBasis_Hip_shared *basis_data;

  // Field reuse info
  bool use_previous_field = field_reuse.index != -1;

  CeedCallBackend(CeedOperatorFieldGetName(op_field, &field_name));
  code << tab << "// -- " << (is_input ? "Input" : "Output") << " field " << i << ": " << field_name << "\n";

  // Get field data
  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
  }
  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
  if (basis != CEED_BASIS_NONE) {
    CeedCallBackend(CeedBasisGetData(basis, &basis_data));
    CeedCallBackend(CeedBasisGetDimension(basis, &dim));
    // P_1d holds the 1D node count for tensor bases and the total node count otherwise
    if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
    else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
  }
  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));

  // Set field constants
  code << tab << "const CeedInt dim" << var_suffix << " = " << dim << ";\n";
  if (is_tensor && !is_all_tensor) {
    // Mixed tensor/non-tensor operator: also emit the total node count P for this field
    CeedInt P = 0;

    CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
    code << tab << "const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q : P) << ";\n";
  }
  code << tab << "const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
  if (eval_mode != CEED_EVAL_WEIGHT) {
    code << tab << "const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
  }

  // Load basis data
  code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
  switch (eval_mode) {
    case CEED_EVAL_NONE:
      break;
    case CEED_EVAL_INTERP:
      if (is_at_points) {
        // AtPoints
        // Lazily build and cache the device Chebyshev interpolation matrix on first use
        if (!basis_data->d_chebyshev_interp_1d) {
          CeedSize    interp_bytes;
          CeedScalar *chebyshev_interp_1d;

          interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
          CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
          CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
          CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
          CeedCallHip(CeedBasisReturnCeed(basis),
                      hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
          CeedCallBackend(CeedFree(&chebyshev_interp_1d));
        }
        if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
        else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
      } else {
        // Standard quadrature
        if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
        else data->B.outputs[i] = basis_data->d_interp_1d;
      }
      if (use_previous_field && !skip_active_load) {
        // Alias the shared-memory interpolation matrix already loaded for an earlier field
        std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

        code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
      } else {
        bool is_collocated = false;

        CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
        if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) {
          // Skipped active load or collocated basis: no matrix needed
          code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
        } else {
          code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
          code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
        }
      }
      break;
    case CEED_EVAL_GRAD:
      if (is_at_points) {
        // AtPoints
        // Lazily build and cache the device Chebyshev interpolation matrix on first use
        if (!basis_data->d_chebyshev_interp_1d) {
          CeedSize    interp_bytes;
          CeedScalar *chebyshev_interp_1d;

          interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
          CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
          CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
          CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
          CeedCallHip(CeedBasisReturnCeed(basis),
                      hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
          CeedCallBackend(CeedFree(&chebyshev_interp_1d));
        }
        if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
        else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
      } else {
        // Standard quadrature
        if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
        else data->B.outputs[i] = basis_data->d_interp_1d;
      }
      if (is_tensor) {
        // Tensor gradient also needs the interpolation matrix s_B
        if (use_previous_field && !skip_active_load) {
          std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

          code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
        } else {
          bool is_collocated = false;

          CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
          if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) {
            code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
          } else {
            code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
            code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
          }
        }
      }
      if (is_at_points) break;  // No G mat for AtPoints
      if (use_3d_slices) {
        // Slices strategy: G is the collocated gradient matrix (Q_1d x Q_1d)
        if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
        else data->G.outputs[i] = basis_data->d_collo_grad_1d;
        if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
          // G reuse requires the earlier field to also be CEED_EVAL_GRAD
          std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

          code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
        } else if (is_active && skip_active_load) {
          code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
        } else {
          code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
          code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
        }
      } else {
        // Prefer the collocated gradient matrix when the basis provides one
        bool has_collo_grad = basis_data->d_collo_grad_1d;

        if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
        else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
        if (has_collo_grad) {
          // Collocated gradient matrix is Q_1d x Q_1d
          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
            std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

            code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
          } else if (is_active && skip_active_load) {
            code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
          } else {
            code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
            code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
          }
        } else {
          // Standard gradient matrix is P x Q (times dim for non-tensor bases)
          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
            std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

            code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
          } else if (is_active && skip_active_load) {
            code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
          } else {
            code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim")
                 << (is_tensor ? "" : var_suffix) << "];\n";
            code << tab << "LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << (is_tensor ? "" : var_suffix) << ">(data, G."
                 << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
          }
        }
      }
      break;
    case CEED_EVAL_WEIGHT:
      break;  // No action
      // LCOV_EXCL_START
    case CEED_EVAL_DIV:
    case CEED_EVAL_CURL:
      break;  // TODO: Not implemented
              // LCOV_EXCL_STOP
  }
  CeedCallBackend(CeedBasisDestroy(&basis));
  return CEED_ERROR_SUCCESS;
}
406 
407 //------------------------------------------------------------------------------
408 // Restriction
409 //------------------------------------------------------------------------------
// Emit the element-restriction code for one operator field: reads the L-vector into the
// per-element buffer r_e_* for inputs, or writes r_e_* back to the L-vector for outputs.
// Also stores the device offset arrays in data->indices.
//
// field_input_buffer - maps each input index to the input whose r_e buffer it shares;
//                      field_input_buffer[i] != i means the restriction was already emitted
static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i,
                                                      CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
                                                      CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
                                                      bool use_3d_slices) {
  std::string              var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
  std::string              P_name     = (is_all_tensor ? "P_1d" : "P") + var_suffix;
  CeedEvalMode             eval_mode  = CEED_EVAL_NONE;
  CeedInt                  elem_size = 0, num_comp = 0;
  CeedSize                 l_size;
  CeedRestrictionType      rstr_type = CEED_RESTRICTION_STANDARD;
  // NOTE(review): rstr_data is only assigned when elem_rstr != CEED_ELEMRESTRICTION_NONE,
  // but the STANDARD/POINTS cases below dereference it — presumably every field reaching
  // those cases carries a real restriction; verify against callers
  CeedElemRestriction_Hip *rstr_data;
  CeedElemRestriction      elem_rstr;

  // Get field data
  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
    CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
    CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
  }
  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));

  // Restriction
  if (is_input) {
    // Input
    if (field_input_buffer[i] != i) {
      std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);

      // Restriction was already done for previous input
      code << tab << "CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
    } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) {
      if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) {
        // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated
        code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
      } else if (rstr_type != CEED_RESTRICTION_POINTS) {
        // Otherwise we're using the scratch space
        code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
      }
      switch (rstr_type) {
        case CEED_RESTRICTION_STANDARD: {
          // Offset-indexed read from the L-vector
          CeedInt comp_stride;

          CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
          code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
          data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
          code << tab << "ReadLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
               << P_name << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix
               << ");\n";
          break;
        }
        case CEED_RESTRICTION_STRIDED: {
          // Strided read; default strides {1, elem_size*num_elem, elem_size} apply when
          // the restriction uses backend strides
          bool    has_backend_strides;
          CeedInt num_elem;

          CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
          CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
          CeedInt strides[3] = {1, elem_size * num_elem, elem_size};

          if (!has_backend_strides) {
            CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
          }
          code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
               << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
          code << tab << "ReadLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
               << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, d" << var_suffix << ", r_e"
               << var_suffix << ");\n";
          break;
        }
        case CEED_RESTRICTION_POINTS: {
          // AtPoints restriction: only the offsets and component stride are needed here
          CeedInt comp_stride;

          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
          data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
          break;
        }
        // LCOV_EXCL_START
        case CEED_RESTRICTION_ORIENTED:
        case CEED_RESTRICTION_CURL_ORIENTED:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
  } else {
    // Output
    switch (rstr_type) {
      case CEED_RESTRICTION_STANDARD: {
        // Offset-indexed write to the L-vector
        CeedInt comp_stride;

        CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
        code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
        CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
        code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
        data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
        code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
             << P_name << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix
             << ");\n";
        break;
      }
      case CEED_RESTRICTION_STRIDED: {
        // Strided write; same default-stride convention as the input case
        bool    has_backend_strides;
        CeedInt num_elem;

        CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
        CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
        CeedInt strides[3] = {1, elem_size * num_elem, elem_size};

        if (!has_backend_strides) {
          CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
        }
        code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
             << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
        code << tab << "WriteLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
             << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, r_e" << var_suffix << ", d" << var_suffix
             << ");\n";
        break;
      }
      case CEED_RESTRICTION_POINTS:
        data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
        break;
      // LCOV_EXCL_START
      case CEED_RESTRICTION_ORIENTED:
      case CEED_RESTRICTION_CURL_ORIENTED:
        break;  // TODO: Not implemented
                // LCOV_EXCL_STOP
    }
  }
  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
  return CEED_ERROR_SUCCESS;
}
543 
544 //------------------------------------------------------------------------------
545 // Basis
546 //------------------------------------------------------------------------------
547 static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i, CeedOperatorField op_field,
548                                                 CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor,
549                                                 bool is_at_points, bool use_3d_slices) {
550   bool      is_tensor = true, is_collocated = true;
551   CeedBasis basis;
552   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
553   CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
554   CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
555 
556   std::string         var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
557   std::string         P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
558   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
559   CeedInt             dim = max_dim, elem_size = 0, num_comp = 0, P_1d = 0;
560   CeedElemRestriction elem_rstr;
561 
562   // Get field data
563   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
564   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
565     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
566     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
567   }
568   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
569   if (basis != CEED_BASIS_NONE) {
570     CeedCallBackend(CeedBasisGetDimension(basis, &dim));
571     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
572     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
573   }
574   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
575 
576   // Basis
577   code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
578   if (is_input) {
579     switch (eval_mode) {
580       case CEED_EVAL_NONE:
581         if (!use_3d_slices && !is_at_points) {
582           code << tab << "CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n";
583         }
584         break;
585       case CEED_EVAL_INTERP:
586         if (is_at_points) {
587           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
588 
589           code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
590           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
591                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
592         } else {
593           std::string function_name = is_tensor ? ((dim == 1 ? "Interp" : "InterpTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
594                                                    std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
595                                                 : "InterpNonTensor";
596           std::string op_t_1d_name  = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
597 
598           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
599           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
600                << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
601         }
602         break;
603       case CEED_EVAL_GRAD:
604         if (is_at_points) {
605           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
606 
607           code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
608           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
609                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
610         } else if (use_3d_slices) {
611           std::string function_name =
612               (dim > 1 ? "InterpTensor" : "Interp") + std::string(is_collocated ? "CollocatedNodes" : "") + std::to_string(dim) + "d";
613 
614           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
615           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
616                << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
617         } else if (is_tensor) {
618           bool        is_collocated_grad = dim == 3 && Q_1d >= P_1d;
619           std::string function_name =
620               (dim == 1 ? "Grad" : ("GradTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) +
621               std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
622           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
623 
624           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
625                << (is_all_tensor && dim >= 3 ? Q_name : "1") << "];\n";
626           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
627                << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
628         } else {
629           std::string function_name = "GradNonTensor";
630 
631           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
632           code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
633                << ", OP_T_1D>(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
634         }
635         break;
636       case CEED_EVAL_WEIGHT: {
637         if (is_at_points) {
638           code << tab << "// Nothing to do AtPoints\n";
639         } else {
640           CeedBasis_Hip_shared *basis_data;
641           std::string           function_name = is_tensor
642                                                     ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
643                                                     : "WeightNonTensor";
644 
645           code << tab << "CeedScalar r_q" << var_suffix << "[" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
646           CeedCallBackend(CeedBasisGetData(basis, &basis_data));
647           data->W = basis_data->d_q_weight_1d;
648           code << tab << function_name << "<" << P_name << ", " << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
649         }
650         break;
651       }
652       // LCOV_EXCL_START
653       case CEED_EVAL_DIV:
654       case CEED_EVAL_CURL:
655         break;  // TODO: Not implemented
656                 // LCOV_EXCL_STOP
657     }
658   } else {
659     switch (eval_mode) {
660       case CEED_EVAL_NONE:
661         code << tab << "CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
662         break;  // No action
663       case CEED_EVAL_INTERP:
664         code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
665         if (is_at_points) {
666           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
667 
668           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
669                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
670         } else {
671           std::string function_name =
672               is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
673                            std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
674                         : "InterpTransposeNonTensor";
675           std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
676 
677           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
678                << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
679         }
680         break;
681       case CEED_EVAL_GRAD:
682         code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
683         if (is_at_points) {
684           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
685 
686           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
687                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
688         } else if (use_3d_slices) {
689           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
690                                       std::to_string(dim) + "d";
691 
692           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
693                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
694         } else if (is_tensor) {
695           bool        is_collocated_grad = dim == 3 && Q_1d >= P_1d;
696           std::string function_name =
697               (dim == 1 ? "GradTranspose"
698                         : ("GradTransposeTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) +
699               std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
700           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
701 
702           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
703                << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
704         } else {
705           std::string function_name = "GradTransposeNonTensor";
706 
707           code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
708                << ", OP_T_1D>(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
709         }
710         break;
711       // LCOV_EXCL_START
712       case CEED_EVAL_WEIGHT:
713         break;  // Should not occur
714       case CEED_EVAL_DIV:
715       case CEED_EVAL_CURL:
716         break;  // TODO: Not implemented
717                 // LCOV_EXCL_STOP
718     }
719   }
720   CeedCallBackend(CeedBasisDestroy(&basis));
721   return CEED_ERROR_SUCCESS;
722 }
723 
724 //------------------------------------------------------------------------------
725 // QFunction
726 //------------------------------------------------------------------------------
// Emit the QFunction-application section of the generated HIP operator kernel
// into `code`: per-output register setup, gathering of inputs into the
// QFunction's `r_s_*` registers, the QFunction call itself, and scatter of the
// outputs back into quadrature/coefficient registers.
//
// Three mutually exclusive generation strategies are used:
//   - is_at_points:   loop over batches of arbitrary points per element,
//                     accumulating into per-element coefficient registers r_c_*
//   - use_3d_slices:  loop over 1D quadrature slices of 3D elements to save
//                     registers, staging slice values in r_s_*
//   - otherwise:      apply the QFunction to full elements; r_s_* simply
//                     aliases the quadrature registers r_q_*
//
// Register naming convention in the emitted code (as used below):
//   r_q_* — quadrature-point data, r_c_* — point-data accumulators (at-points
//   only), r_s_* — values handed to/from the QFunction, r_x — point coordinates.
//
// Emits into `code` (tab tracks indentation of the generated source); also
// writes restriction offsets into data->indices.inputs[] for the 3d-slices
// strided/standard read path. Returns CEED_ERROR_SUCCESS.
//
// NOTE(review): the Q_1d and max_num_points parameters appear unused in this
// function body (only the literal strings "Q_1d"/"max_num_points" are emitted)
// — confirm before removing.
static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt max_dim,
                                                    CeedInt max_num_points, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
                                                    CeedQFunctionField *qf_input_fields, CeedInt num_output_fields,
                                                    CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields,
                                                    std::string qfunction_name, CeedInt Q_1d, bool is_all_tensor, bool is_at_points,
                                                    bool use_3d_slices, bool is_assemble) {
  // "Q_1d" for all-tensor bases, plain "Q" otherwise — must match the template
  // parameter names declared earlier in the generated kernel
  std::string         Q_name    = is_all_tensor ? "Q_1d" : "Q";
  CeedEvalMode        eval_mode = CEED_EVAL_NONE;
  CeedElemRestriction elem_rstr;

  // Setup output arrays
  // Declare the per-output registers the QFunction results will land in;
  // sizing depends on eval mode and on whether a 3D tensor thread owns a
  // full 1D column of quadrature points (Q_name) or a single point ("1")
  code << "\n";
  code << tab << "// -- Output field setup\n";
  for (CeedInt i = 0; i < num_output_fields; i++) {
    const char *field_name;
    std::string var_suffix = "_out_" + std::to_string(i);

    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
    code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
    switch (eval_mode) {
      case CEED_EVAL_NONE:
        if (is_at_points) {
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n";
        } else {
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
               << "];\n";
        }
        break;
      case CEED_EVAL_INTERP:
        if (is_at_points) {
          // Accumulator for point data
          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
          code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
               << "[i] = 0.0;\n";
        } else {
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
               << "];\n";
        }
        break;
      case CEED_EVAL_GRAD:
        if (is_at_points) {
          // Accumulator for point data
          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
          code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
               << "[i] = 0.0;\n";
        } else if (use_3d_slices) {
          // Accumulator for gradient slices
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
          code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) r_q" << var_suffix << "[i] = 0.0;\n";
        } else {
          // Full gradient: one register block per dimension component
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
               << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") << "];\n";
        }
        break;
      case CEED_EVAL_WEIGHT:
        break;
        // LCOV_EXCL_START
      case CEED_EVAL_DIV:
      case CEED_EVAL_CURL:
        break;  // TODO: Not implemented
                // LCOV_EXCL_STOP
    }
  }

  if (is_at_points) {
    // We need to handle batches of points
    // The point loop bound is rounded up to a multiple of the thread block
    // size so every thread takes the same number of loop trips
    code << "\n";
    code << tab << "// Note: Using batches of points\n";
    code << tab << "const CeedInt point_loop_bound = (blockDim.x*blockDim.y) * ceil((1.0*max_num_points) / (blockDim.x*blockDim.y));\n\n";
    code << tab << "#pragma unroll\n";
    code << tab << "for (CeedInt i = threadIdx.x + threadIdx.y*blockDim.x; i < point_loop_bound; i += blockDim.x*blockDim.y) {\n";
    tab.push();
    code << tab << "const CeedInt p = i % max_num_points;\n\n";

    // Each point needs its physical coordinates for the Chebyshev evaluation
    code << tab << "// -- Coordinates\n";
    code << tab << "CeedScalar r_x[max_dim];\n";
    code << tab << "ReadPoint<max_dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n";

    code << tab << "// -- Input fields\n";
    for (CeedInt i = 0; i < num_input_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_in_" + std::to_string(i);
      std::string P_name     = "P_1d" + var_suffix;

      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
      code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
      // Basis action
      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          // No basis action — read the point values straight from the E-vector
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          code << tab << "ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
               << ", max_num_points>(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
          break;
        case CEED_EVAL_INTERP:
          // Evaluate element coefficients r_c at the point coordinates r_x
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          code << tab << "InterpAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
               << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
          break;
        case CEED_EVAL_GRAD:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          code << tab << "GradAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
               << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
          break;
        case CEED_EVAL_WEIGHT:
          // Quadrature weight is identically 1.0 at arbitrary points
          code << tab << "CeedScalar r_s" << var_suffix << "[1];\n";
          code << tab << "r_s" << var_suffix << "[0] = 1.0;\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
    code << "\n";
    code << tab << "// -- Output fields\n";
    // Only register declarations here; the transpose scatter happens after the
    // QFunction call below
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_out_" + std::to_string(i);

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      // Basis action
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          break;
        case CEED_EVAL_INTERP:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          break;
        case CEED_EVAL_GRAD:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_WEIGHT:
          break;  // Should not occur
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }

  } else if (use_3d_slices) {
    // We treat quadrature points per slice in 3d to save registers
    code << "\n";
    code << tab << "// Note: Using planes of 3D elements\n";
    code << tab << "#pragma unroll\n";
    code << tab << "for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
    tab.push();
    code << tab << "// -- Input fields\n";
    for (CeedInt i = 0; i < num_input_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_in_" + std::to_string(i);

      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
      code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
      // Basis action
      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          // No basis action — read this slice directly from the L-vector;
          // the read path differs for strided vs. offset-indexed restrictions
          bool is_strided;

          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";

          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
          CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
          if (is_strided) {
            bool    has_backend_strides;
            CeedInt num_elem, elem_size;

            CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
            CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
            CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
            // Default backend strides; overwritten below when the restriction
            // carries user-specified strides
            CeedInt strides[3] = {1, elem_size * num_elem, elem_size};

            if (!has_backend_strides) {
              CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
            }
            code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
                 << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
            code << tab << "ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", strides" << var_suffix << "_0, strides"
                 << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n";
          } else {
            CeedSize                 l_size = 0;
            CeedInt                  comp_stride;
            CeedElemRestriction_Hip *rstr_data;

            CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
            code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
            CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
            code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
            CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
            // Stash the device offsets so the kernel launch can pass them in
            data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
            code << tab << "ReadEVecSliceStandard3d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", " << Q_name << ">(data, l_size"
                 << var_suffix << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
          }
          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
          break;
        case CEED_EVAL_INTERP:
          // Copy this slice's values out of the full quadrature registers
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n";
          tab.push();
          code << tab << "r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n";
          tab.pop();
          code << tab << "}\n";
          break;
        case CEED_EVAL_GRAD:
          // Collocated gradient of the interpolated values on this slice
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          code << tab << "GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_q" << var_suffix << ", s_G"
               << var_suffix << ", r_s" << var_suffix << ");\n";
          break;
        case CEED_EVAL_WEIGHT:
          code << tab << "CeedScalar r_s" << var_suffix << "[1];\n";
          code << tab << "r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
    code << "\n";
    code << tab << "// -- Output fields\n";
    // Register declarations only; the per-slice scatter back into r_q happens
    // after the QFunction call below
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_out_" + std::to_string(i);

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      // Basis action
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          break;
        case CEED_EVAL_INTERP:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          break;
        case CEED_EVAL_GRAD:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_WEIGHT:
          break;  // Should not occur
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
  } else {
    // Full-element path: the QFunction registers simply alias the quadrature
    // registers, so no copy is needed in either direction
    code << "\n";
    code << tab << "// Note: Using full elements\n";
    code << tab << "{\n";
    tab.push();
    code << tab << "// -- Input fields\n";
    for (CeedInt i = 0; i < num_input_fields; i++) {
      const char *field_name;

      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
      code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
      code << tab << "CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n";
    }
    code << tab << "// -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      code << tab << "CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n";
    }
  }

  // Input and output buffers
  // Collect the r_s registers into the pointer arrays the QFunction expects;
  // size at least 1 so the arrays are valid even with zero fields
  code << "\n";
  code << tab << "// -- QFunction inputs and outputs\n";
  code << tab << "// ---- Inputs\n";
  code << tab << "CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
  for (CeedInt i = 0; i < num_input_fields; i++) {
    const char *field_name;

    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
    code << tab << "// ------ Input field " << i << ": " << field_name << "\n";
    code << tab << "inputs[" << i << "] = r_s_in_" << i << ";\n";
  }
  code << tab << "// ---- Outputs\n";
  code << tab << "CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
  for (CeedInt i = 0; i < num_output_fields; i++) {
    const char *field_name;

    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
    code << tab << "// ------ Output field " << i << ": " << field_name << "\n";
    code << tab << "outputs[" << i << "] = r_s_out_" << i << ";\n";
  }

  // Apply QFunction
  // Q argument: only the full-element 3D all-tensor path processes a whole
  // column of quadrature points per call; every other path handles one point
  code << "\n";
  code << tab << "// -- Apply QFunction\n";
  code << tab << "" << qfunction_name << "(ctx, ";
  if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
    code << "1";
  } else {
    code << Q_name;
  }
  code << ", inputs, outputs);\n";

  if (is_at_points) {
    // Map back to coefficients
    code << "\n";
    code << tab << "// -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_out_" + std::to_string(i);
      std::string P_name     = "P_1d" + var_suffix;

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      // Basis action
      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
      switch (eval_mode) {
        case CEED_EVAL_NONE: {
          CeedInt             comp_stride;
          CeedElemRestriction elem_rstr;

          // Assembly does not write point values back to the L-vector
          if (is_assemble) break;
          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
          code << tab << "WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
               << ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]"
               << ", r_s" << var_suffix << ", d" << var_suffix << ");\n";
          break;
        }
        case CEED_EVAL_INTERP:
          // Zero padding-point contributions so the rounded-up point loop
          // does not pollute the coefficient accumulation
          code << tab << "if (i >= points.num_per_elem[elem]) {\n";
          tab.push();
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
          tab.pop();
          code << tab << "}\n";
          code << tab << "InterpTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
               << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
          break;
        case CEED_EVAL_GRAD:
          // Same padding-point zeroing as INTERP, but across all dim components
          code << tab << "if (i >= points.num_per_elem[elem]) {\n";
          tab.push();
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
          tab.pop();
          code << tab << "}\n";
          code << tab << "GradTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
               << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_WEIGHT:
          break;  // Should not occur
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
  } else if (use_3d_slices) {
    // Copy or apply transpose grad, if needed
    code << "\n";
    code << tab << "// -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_out_" + std::to_string(i);
      std::string P_name     = "P_1d" + var_suffix;

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      // Basis action
      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          // Copy this slice's QFunction output back into the full registers
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
          tab.push();
          code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
          tab.pop();
          code << tab << "}\n";
          break;
        case CEED_EVAL_INTERP:
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
          tab.push();
          code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
          tab.pop();
          code << tab << "}\n";
          break;
        case CEED_EVAL_GRAD:
          // Transpose collocated gradient accumulates into r_q across slices
          code << tab << "GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_s" << var_suffix << ", s_G"
               << var_suffix << ", r_q" << var_suffix << ");\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_WEIGHT:
          break;  // Should not occur
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
  }
  // Close the point/slice loop (or the plain scope in the full-element path) —
  // all three branches above pushed exactly one tab level and opened one brace
  tab.pop();
  code << tab << "}\n";
  return CEED_ERROR_SUCCESS;
}
1143 
1144 //------------------------------------------------------------------------------
1145 // Build single operator kernel
1146 //------------------------------------------------------------------------------
1147 extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_build) {
1148   bool                   is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
1149   Ceed                   ceed;
1150   CeedInt                Q = 0, Q_1d = 0, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
1151   CeedQFunctionField    *qf_input_fields, *qf_output_fields;
1152   CeedQFunction_Hip_gen *qf_data;
1153   CeedQFunction          qf;
1154   CeedOperatorField     *op_input_fields, *op_output_fields;
1155   CeedOperator_Hip_gen  *data;
1156   std::ostringstream     code;
1157   Tab                    tab;
1158 
1159   CeedCallBackend(CeedOperatorGetData(op, &data));
1160   {
1161     bool is_setup_done;
1162 
1163     CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
1164     if (is_setup_done) {
1165       *is_good_build = !data->use_fallback;
1166       return CEED_ERROR_SUCCESS;
1167     }
1168   }
1169 
1170   // Check field compatibility
1171   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
1172   {
1173     bool has_shared_bases = true;
1174 
1175     for (CeedInt i = 0; i < num_input_fields; i++) {
1176       CeedBasis basis;
1177 
1178       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
1179       if (basis != CEED_BASIS_NONE) {
1180         bool        is_tensor = true;
1181         const char *resource;
1182         char       *resource_root;
1183         Ceed        basis_ceed;
1184 
1185         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
1186         is_all_tensor    = is_all_tensor && is_tensor;
1187         is_all_nontensor = is_all_nontensor && !is_tensor;
1188         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
1189         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
1190         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
1191         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
1192         CeedCallBackend(CeedFree(&resource_root));
1193         CeedCallBackend(CeedDestroy(&basis_ceed));
1194       }
1195       CeedCallBackend(CeedBasisDestroy(&basis));
1196     }
1197 
1198     for (CeedInt i = 0; i < num_output_fields; i++) {
1199       CeedBasis basis;
1200 
1201       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
1202       if (basis != CEED_BASIS_NONE) {
1203         bool        is_tensor = true;
1204         const char *resource;
1205         char       *resource_root;
1206         Ceed        basis_ceed;
1207 
1208         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
1209         is_all_tensor    = is_all_tensor && is_tensor;
1210         is_all_nontensor = is_all_nontensor && !is_tensor;
1211 
1212         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
1213         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
1214         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
1215         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
1216         CeedCallBackend(CeedFree(&resource_root));
1217         CeedCallBackend(CeedDestroy(&basis_ceed));
1218       }
1219       CeedCallBackend(CeedBasisDestroy(&basis));
1220     }
1221     // -- Fallback to ref if not all bases are shared
1222     if (!has_shared_bases) {
1223       *is_good_build = false;
1224       return CEED_ERROR_SUCCESS;
1225     }
1226   }
1227   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1228   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
1229   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
1230   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
1231 
1232   // Get operator data
1233   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
1234   {
1235     CeedInt max_P = 0, max_P_1d = 0;
1236 
1237     CeedCallBackend(CeedOperatorBuildKernelData_Hip_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields,
1238                                                         qf_output_fields, &max_P, &max_P_1d, &Q, &Q_1d, &max_dim, &is_all_tensor, &use_3d_slices));
1239     data->max_P_1d = is_all_tensor ? max_P_1d : max_P;
1240   }
1241   if (is_at_points) {
1242     CeedInt                  coords_dim = 0;
1243     CeedElemRestriction_Hip *rstr_data;
1244     CeedElemRestriction      rstr_points = NULL;
1245 
1246     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
1247     CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
1248     CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
1249     CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_points, &coords_dim));
1250     CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data));
1251     data->points.indices = (CeedInt *)rstr_data->d_offsets;
1252     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
1253     if (max_dim == 0) max_dim = coords_dim;
1254     if (Q_1d == 0) max_num_points = ceil(pow(max_num_points, 1.0 / max_dim));
1255   }
1256   if (max_dim == 0) max_dim = 1;
1257   data->dim = max_dim;
1258   if (is_at_points) use_3d_slices = false;
1259   if (Q_1d == 0) {
1260     if (is_at_points) Q_1d = max_num_points;
1261     else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d));
1262   }
1263   if (Q == 0) Q = Q_1d;
1264   data->Q    = Q;
1265   data->Q_1d = Q_1d;
1266 
1267   // Check for restriction only identity operator
1268   {
1269     bool is_identity_qf;
1270 
1271     CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf));
1272     if (is_identity_qf) {
1273       CeedEvalMode eval_mode_in, eval_mode_out;
1274 
1275       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in));
1276       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out));
1277       CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND,
1278                 "Backend does not implement restriction only identity operators");
1279     }
1280   }
1281 
1282   // Load basis source files
1283   if (!is_all_nontensor) {
1284     code << tab << "// Tensor basis source\n";
1285     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
1286   }
1287   if (!is_all_tensor) {
1288     code << tab << "// Non-tensor basis source\n";
1289     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n";
1290   }
1291   if (is_at_points) {
1292     code << tab << "// AtPoints basis source\n";
1293     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n";
1294   }
1295   if (!is_all_tensor && !is_all_nontensor) {
1296     code << tab << "// Tensor basis source\n";
1297     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h>\n\n";
1298   }
1299   code << tab << "// CodeGen operator source\n";
1300   code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
1301 
1302   // Get QFunction name
1303   std::string qfunction_name(qf_data->qfunction_name);
1304   std::string operator_name;
1305 
1306   operator_name = "CeedKernelHipGenOperator_" + qfunction_name;
1307 
1308   // Define CEED_Q_VLA
1309   code << "\n" << tab << "#undef CEED_Q_VLA\n";
1310   if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
1311     code << tab << "#define CEED_Q_VLA 1\n\n";
1312   } else {
1313     code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n";
1314   }
1315 
1316   // Add user QFunction source
1317   {
1318     const char *source_path;
1319 
1320     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
1321     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
1322 
1323     code << tab << "// User QFunction source\n";
1324     code << tab << "#include \"" << source_path << "\"\n\n";
1325   }
1326 
1327   // Setup
1328   code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
1329   code << tab << "// Operator Kernel\n";
1330   code << tab << "// \n";
1331   code << tab << "// d_[in,out]_i:   CeedVector device array\n";
1332   code << tab << "// r_[in,out]_e_i: Element vector register\n";
1333   code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
1334   code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
1335   code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
1336   code << tab << "// \n";
1337   code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
1338   code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
1339   code << tab << "// -----------------------------------------------------------------------------\n";
1340   code << tab << "extern \"C\" __launch_bounds__(BLOCK_SIZE)\n";
1341   code << "__global__ void " << operator_name
1342        << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar* W, Points_Hip points) {\n";
1343   tab.push();
1344 
1345   // Scratch buffers
1346   for (CeedInt i = 0; i < num_input_fields; i++) {
1347     CeedEvalMode eval_mode;
1348 
1349     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1350     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
1351       code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
1352     }
1353   }
1354   for (CeedInt i = 0; i < num_output_fields; i++) {
1355     code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
1356   }
1357 
1358   code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
1359   if (!is_all_tensor) {
1360     code << tab << "const CeedInt Q = " << Q << ";\n";
1361   }
1362   if (!is_all_nontensor) {
1363     code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
1364   }
1365   if (is_at_points) {
1366     code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n";
1367     code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
1368   }
1369 
1370   // Shared data
1371   code << tab << "extern __shared__ CeedScalar slice[];\n";
1372   code << tab << "SharedData_Hip data;\n";
1373   code << tab << "data.t_id_x = threadIdx.x;\n";
1374   code << tab << "data.t_id_y = threadIdx.y;\n";
1375   code << tab << "data.t_id_z = threadIdx.z;\n";
1376   code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
1377   code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
1378 
1379   // -- Determine input mat reuse
1380   FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
1381 
1382   for (CeedInt i = 0; i < num_input_fields; i++) {
1383     input_matrix_reuse[i].index = -1;
1384   }
1385   for (CeedInt i = 0; i < num_input_fields; i++) {
1386     bool         is_tensor = true;
1387     CeedEvalMode eval_mode_i;
1388     CeedBasis    basis_i;
1389 
1390     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
1391     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
1392     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
1393     CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
1394     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
1395       CeedEvalMode eval_mode_j;
1396       CeedBasis    basis_j;
1397 
1398       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1399       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1400       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1401       if (basis_i == basis_j) {
1402         if (is_tensor) {
1403           input_matrix_reuse[i].index     = j;
1404           input_matrix_reuse[i].is_input  = true;
1405           input_matrix_reuse[i].eval_mode = eval_mode_j;
1406         } else {
1407           // For non-tensor can only re-use with the same eval mode
1408           if (eval_mode_i == eval_mode_j) {
1409             input_matrix_reuse[i].index     = j;
1410             input_matrix_reuse[i].is_input  = true;
1411             input_matrix_reuse[i].eval_mode = eval_mode_j;
1412           }
1413         }
1414       }
1415       CeedCallBackend(CeedBasisDestroy(&basis_j));
1416     }
1417     CeedCallBackend(CeedBasisDestroy(&basis_i));
1418   }
1419 
1420   // -- Determine output mat reuse
1421   FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
1422 
1423   for (CeedInt i = 0; i < num_output_fields; i++) {
1424     output_matrix_reuse[i].index = -1;
1425   }
1426   for (CeedInt i = 0; i < num_output_fields; i++) {
1427     bool         is_tensor = true;
1428     CeedEvalMode eval_mode_i;
1429     CeedBasis    basis_i;
1430 
1431     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
1432     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
1433     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
1434       CeedEvalMode eval_mode_j;
1435       CeedBasis    basis_j;
1436 
1437       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1438       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1439       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1440       if (basis_i == basis_j) {
1441         if (is_tensor) {
1442           output_matrix_reuse[i].index     = j;
1443           output_matrix_reuse[i].is_input  = true;
1444           output_matrix_reuse[i].eval_mode = eval_mode_j;
1445         } else {
1446           // For non-tensor can only re-use with the same eval mode
1447           if (eval_mode_i == eval_mode_j) {
1448             output_matrix_reuse[i].index     = j;
1449             output_matrix_reuse[i].is_input  = true;
1450             output_matrix_reuse[i].eval_mode = eval_mode_j;
1451           }
1452         }
1453       }
1454       CeedCallBackend(CeedBasisDestroy(&basis_j));
1455     }
1456     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
1457       CeedEvalMode eval_mode_j;
1458       CeedBasis    basis_j;
1459 
1460       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
1461       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1462       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
1463       CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
1464       if (basis_i == basis_j) {
1465         if (is_tensor) {
1466           output_matrix_reuse[i].index     = j;
1467           output_matrix_reuse[i].is_input  = false;
1468           output_matrix_reuse[i].eval_mode = eval_mode_j;
1469         } else {
1470           // For non-tensor can only re-use with the same eval mode
1471           if (eval_mode_i == eval_mode_j) {
1472             output_matrix_reuse[i].index     = j;
1473             output_matrix_reuse[i].is_input  = false;
1474             output_matrix_reuse[i].eval_mode = eval_mode_j;
1475           }
1476         }
1477       }
1478       CeedCallBackend(CeedBasisDestroy(&basis_j));
1479     }
1480     CeedCallBackend(CeedBasisDestroy(&basis_i));
1481   }
1482 
1483   // Initialize constants, and matrices B and G
1484   code << "\n" << tab << "// Input field constants and basis data\n";
1485   for (CeedInt i = 0; i < num_input_fields; i++) {
1486     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
1487                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false));
1488   }
1489   code << "\n" << tab << "// Output field constants and basis data\n";
1490   for (CeedInt i = 0; i < num_output_fields; i++) {
1491     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
1492                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false));
1493   }
1494 
1495   // Loop over all elements
1496   code << "\n" << tab << "// Element loop\n";
1497   code << tab << "__syncthreads();\n";
1498   code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
1499   tab.push();
1500 
1501   // -- Compute minimum buffer space needed
1502   CeedInt max_rstr_buffer_size = 1;
1503 
1504   for (CeedInt i = 0; i < num_input_fields; i++) {
1505     CeedEvalMode eval_mode;
1506 
1507     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1508     if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
1509       CeedInt             num_comp;
1510       CeedElemRestriction elem_rstr;
1511 
1512       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
1513       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1514       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1515       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1516     }
1517   }
1518   for (CeedInt i = 0; i < num_output_fields; i++) {
1519     CeedEvalMode eval_mode;
1520 
1521     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
1522     if (eval_mode != CEED_EVAL_NONE) {
1523       CeedInt             num_comp;
1524       CeedElemRestriction elem_rstr;
1525 
1526       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
1527       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1528       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1529       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1530     }
1531   }
1532   code << tab << "// Scratch restriction buffer space\n";
1533   code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
1534 
1535   // -- Determine best input field processing order
1536   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
1537 
1538   for (CeedInt i = 0; i < num_input_fields; i++) {
1539     field_rstr_in_buffer[i] = -1;
1540     input_field_order[i]    = -1;
1541   }
1542   {
1543     bool    is_ordered[CEED_FIELD_MAX];
1544     CeedInt curr_index = 0;
1545 
1546     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
1547     for (CeedInt i = 0; i < num_input_fields; i++) {
1548       CeedVector          vec_i;
1549       CeedElemRestriction rstr_i;
1550 
1551       if (is_ordered[i]) continue;
1552       field_rstr_in_buffer[i]       = i;
1553       is_ordered[i]                 = true;
1554       input_field_order[curr_index] = i;
1555       curr_index++;
1556       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
1557       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
1558       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
1559       for (CeedInt j = i + 1; j < num_input_fields; j++) {
1560         CeedVector          vec_j;
1561         CeedElemRestriction rstr_j;
1562 
1563         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
1564         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
1565         if (rstr_i == rstr_j && vec_i == vec_j) {
1566           field_rstr_in_buffer[j]       = i;
1567           is_ordered[j]                 = true;
1568           input_field_order[curr_index] = j;
1569           curr_index++;
1570         }
1571         CeedCallBackend(CeedVectorDestroy(&vec_j));
1572         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
1573       }
1574       CeedCallBackend(CeedVectorDestroy(&vec_i));
1575       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
1576     }
1577   }
1578 
1579   // -- Input restriction and basis
1580   code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
1581   for (CeedInt i = 0; i < num_input_fields; i++) {
1582     const char   *field_name;
1583     const CeedInt f = input_field_order[i];
1584 
1585     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
1586     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
1587 
1588     // ---- Restriction
1589     CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
1590                                                                max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1591 
1592     // ---- Basis action
1593     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
1594                                                          is_all_tensor, is_at_points, use_3d_slices));
1595   }
1596 
1597   // -- Q function
1598   CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
1599                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
1600                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices, false));
1601 
1602   // -- Output basis and restriction
1603   code << "\n" << tab << "// -- Output field basis action and restrictions\n";
1604   for (CeedInt i = 0; i < num_output_fields; i++) {
1605     const char *field_name;
1606 
1607     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
1608     code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
1609 
1610     // ---- Basis action
1611     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
1612                                                          is_all_tensor, is_at_points, use_3d_slices));
1613 
1614     // ---- Restriction
1615     CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, i, NULL, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d,
1616                                                                false, is_all_tensor, is_at_points, use_3d_slices));
1617   }
1618 
1619   // Close loop and function
1620   tab.pop();
1621   code << tab << "}\n";
1622   tab.pop();
1623   code << tab << "}\n";
1624   code << tab << "// -----------------------------------------------------------------------------\n\n";
1625 
1626   CeedInt block_sizes[3] = {0, 0, 0};
1627   CeedInt num_elem;
1628 
1629   // Compile
1630   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
1631   CeedCallBackend(BlockGridCalculate_Hip_gen(is_all_tensor ? max_dim : 1, num_elem, data->max_P_1d, is_all_tensor ? Q_1d : Q, block_sizes));
1632   {
1633     bool is_compile_good = false;
1634 
1635     data->thread_1d = block_sizes[0];
1636     CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "OP_T_1D", block_sizes[0], "BLOCK_SIZE",
1637                                        block_sizes[0] * block_sizes[1] * block_sizes[2]));
1638     if (is_compile_good) {
1639       *is_good_build = true;
1640       CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, operator_name.c_str(), &data->op));
1641     } else {
1642       *is_good_build     = false;
1643       data->use_fallback = true;
1644     }
1645   }
1646   CeedCallBackend(CeedOperatorSetSetupDone(op));
1647   CeedCallBackend(CeedDestroy(&ceed));
1648   CeedCallBackend(CeedQFunctionDestroy(&qf));
1649   return CEED_ERROR_SUCCESS;
1650 }
1651 
1652 //------------------------------------------------------------------------------
1653 // Build AtPoints assembly operator kernel
1654 //------------------------------------------------------------------------------
1655 static int CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op, bool is_full, bool *is_good_build) {
1656   bool                   is_all_tensor = true, is_at_points = false, use_3d_slices = false;
1657   Ceed                   ceed;
1658   CeedInt                Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
1659   CeedQFunctionField    *qf_input_fields, *qf_output_fields;
1660   CeedQFunction_Hip_gen *qf_data;
1661   CeedQFunction          qf;
1662   CeedOperatorField     *op_input_fields, *op_output_fields;
1663   CeedOperator_Hip_gen  *data;
1664   std::ostringstream     code;
1665   Tab                    tab;
1666 
1667   // Check compatibility
1668   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1669   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
1670   CeedCheck(is_at_points, ceed, CEED_ERROR_BACKEND, "Only AtPoints operator assembly supported");
1671 
1672   // Retrieve operator data
1673   CeedCallBackend(CeedOperatorGetData(op, &data));
1674   Q       = data->Q;
1675   Q_1d    = data->Q_1d;
1676   max_dim = data->dim;
1677   {
1678     CeedElemRestriction rstr_points = NULL;
1679 
1680     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
1681     CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
1682     CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
1683     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
1684   }
1685   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
1686   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
1687   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
1688   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
1689 
1690   // Load basis source files
1691   code << tab << "// Tensor basis source\n";
1692   code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
1693   code << tab << "// AtPoints basis source\n";
1694   code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n";
1695   code << tab << "// CodeGen operator source\n";
1696   code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
1697 
1698   // Get QFunction name
1699   std::string qfunction_name(qf_data->qfunction_name);
1700   std::string operator_name;
1701 
1702   if (is_full) {
1703     operator_name = "CeedKernelHipGenOperatorFullAssembly_" + qfunction_name;
1704   } else {
1705     operator_name = "CeedKernelHipGenOperatorDiagonalAssembly_" + qfunction_name;
1706   }
1707 
1708   // Define CEED_Q_VLA
1709   code << "\n" << tab << "#undef CEED_Q_VLA\n";
1710   code << tab << "#define CEED_Q_VLA 1\n\n";
1711 
1712   // Add user QFunction source
1713   {
1714     const char *source_path;
1715 
1716     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
1717     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
1718 
1719     code << tab << "// User QFunction source\n";
1720     code << tab << "#include \"" << source_path << "\"\n\n";
1721   }
1722 
1723   // Setup
1724   code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
1725   code << tab << "// Operator Assembly Kernel\n";
1726   code << tab << "// \n";
1727   code << tab << "// d_[in,out]_i:   CeedVector device array\n";
1728   code << tab << "// r_[in,out]_e_i: Element vector register\n";
1729   code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
1730   code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
1731   code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
1732   code << tab << "// \n";
1733   code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
1734   code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
1735   code << tab << "// -----------------------------------------------------------------------------\n";
1736   code << tab << "extern \"C\" __global__ void " << operator_name
1737        << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar *W, Points_Hip "
1738           "points, CeedScalar *__restrict__ values_array) {\n";
1739   tab.push();
1740 
1741   // Scratch buffers
1742   for (CeedInt i = 0; i < num_input_fields; i++) {
1743     CeedEvalMode eval_mode;
1744 
1745     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1746     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
1747       code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
1748     }
1749   }
1750   for (CeedInt i = 0; i < num_output_fields; i++) {
1751     code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
1752   }
1753 
1754   code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
1755   code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
1756   code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n";
1757   code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
1758 
1759   // Shared data
1760   code << tab << "extern __shared__ CeedScalar slice[];\n";
1761   code << tab << "SharedData_Hip data;\n";
1762   code << tab << "data.t_id_x = threadIdx.x;\n";
1763   code << tab << "data.t_id_y = threadIdx.y;\n";
1764   code << tab << "data.t_id_z = threadIdx.z;\n";
1765   code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
1766   code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
1767 
1768   // -- Determine input mat reuse
1769   FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
1770 
1771   for (CeedInt i = 0; i < num_input_fields; i++) {
1772     input_matrix_reuse[i].index = -1;
1773   }
1774   for (CeedInt i = 0; i < num_input_fields; i++) {
1775     CeedEvalMode eval_mode_i;
1776     CeedBasis    basis_i;
1777 
1778     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
1779     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
1780     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
1781     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
1782       CeedEvalMode eval_mode_j;
1783       CeedBasis    basis_j;
1784 
1785       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1786       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1787       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1788       if (basis_i == basis_j) {
1789         input_matrix_reuse[i].index     = j;
1790         input_matrix_reuse[i].is_input  = true;
1791         input_matrix_reuse[i].eval_mode = eval_mode_j;
1792       }
1793       CeedCallBackend(CeedBasisDestroy(&basis_j));
1794     }
1795     CeedCallBackend(CeedBasisDestroy(&basis_i));
1796   }
1797 
1798   // -- Determine output mat reuse
1799   FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
1800 
1801   for (CeedInt i = 0; i < num_output_fields; i++) {
1802     output_matrix_reuse[i].index = -1;
1803   }
1804   for (CeedInt i = 0; i < num_output_fields; i++) {
1805     CeedEvalMode eval_mode_i;
1806     CeedBasis    basis_i;
1807 
1808     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
1809     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
1810     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
1811       CeedEvalMode eval_mode_j;
1812       CeedBasis    basis_j;
1813 
1814       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1815       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1816       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1817       if (basis_i == basis_j) {
1818         output_matrix_reuse[i].index     = j;
1819         output_matrix_reuse[i].is_input  = true;
1820         output_matrix_reuse[i].eval_mode = eval_mode_j;
1821       }
1822       CeedCallBackend(CeedBasisDestroy(&basis_j));
1823     }
1824     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
1825       CeedEvalMode eval_mode_j;
1826       CeedBasis    basis_j;
1827 
1828       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
1829       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1830       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
1831       if (basis_i == basis_j) {
1832         output_matrix_reuse[i].index     = j;
1833         output_matrix_reuse[i].is_input  = false;
1834         output_matrix_reuse[i].eval_mode = eval_mode_j;
1835       }
1836       CeedCallBackend(CeedBasisDestroy(&basis_j));
1837     }
1838     CeedCallBackend(CeedBasisDestroy(&basis_i));
1839   }
1840 
1841   // Initialize constants, and matrices B and G
1842   code << "\n" << tab << "// Input field constants and basis data\n";
1843   for (CeedInt i = 0; i < num_input_fields; i++) {
1844     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
1845                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false));
1846   }
1847   code << "\n" << tab << "// Output field constants and basis data\n";
1848   for (CeedInt i = 0; i < num_output_fields; i++) {
1849     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
1850                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false));
1851   }
1852 
1853   // Loop over all elements
1854   code << "\n" << tab << "// Element loop\n";
1855   code << tab << "__syncthreads();\n";
1856   code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
1857   tab.push();
1858 
1859   // -- Compute minimum buffer space needed
1860   CeedInt max_rstr_buffer_size = 1;
1861 
1862   for (CeedInt i = 0; i < num_input_fields; i++) {
1863     CeedEvalMode eval_mode;
1864 
1865     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1866     if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
1867       CeedInt             num_comp;
1868       CeedElemRestriction elem_rstr;
1869 
1870       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
1871       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1872       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1873       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1874     }
1875   }
1876   for (CeedInt i = 0; i < num_output_fields; i++) {
1877     CeedEvalMode eval_mode;
1878 
1879     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
1880     if (eval_mode != CEED_EVAL_NONE) {
1881       CeedInt             num_comp;
1882       CeedElemRestriction elem_rstr;
1883 
1884       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
1885       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1886       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1887       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1888     }
1889   }
1890   code << tab << "// Scratch restriction buffer space\n";
1891   code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
1892 
1893   // -- Determine best input field processing order
1894   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
1895 
1896   for (CeedInt i = 0; i < num_input_fields; i++) {
1897     field_rstr_in_buffer[i] = -1;
1898     input_field_order[i]    = -1;
1899   }
1900   {
1901     bool    is_ordered[CEED_FIELD_MAX];
1902     CeedInt curr_index = 0;
1903 
1904     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
1905     for (CeedInt i = 0; i < num_input_fields; i++) {
1906       CeedVector          vec_i;
1907       CeedElemRestriction rstr_i;
1908 
1909       if (is_ordered[i]) continue;
1910       field_rstr_in_buffer[i]       = i;
1911       is_ordered[i]                 = true;
1912       input_field_order[curr_index] = i;
1913       curr_index++;
1914       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
1915       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
1916       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
1917       for (CeedInt j = i + 1; j < num_input_fields; j++) {
1918         CeedVector          vec_j;
1919         CeedElemRestriction rstr_j;
1920 
1921         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
1922         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
1923         if (rstr_i == rstr_j && vec_i == vec_j) {
1924           field_rstr_in_buffer[j]       = i;
1925           is_ordered[j]                 = true;
1926           input_field_order[curr_index] = j;
1927           curr_index++;
1928         }
1929         CeedCallBackend(CeedVectorDestroy(&vec_j));
1930         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
1931       }
1932       CeedCallBackend(CeedVectorDestroy(&vec_i));
1933       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
1934     }
1935   }
1936 
1937   // -- Input restriction and basis
1938   code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
1939   CeedInt active_field_index = -1;
1940 
1941   for (CeedInt i = 0; i < num_input_fields; i++) {
1942     bool          is_active = false;
1943     const char   *field_name;
1944     const CeedInt f = input_field_order[i];
1945 
1946     {
1947       CeedVector vec;
1948 
1949       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
1950       is_active = vec == CEED_VECTOR_ACTIVE;
1951       CeedCallBackend(CeedVectorDestroy(&vec));
1952     }
1953 
1954     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
1955     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
1956 
1957     if (is_active) {
1958       std::string var_suffix = "_in_" + std::to_string(f);
1959 
1960       code << tab << "// Active field - no restriction or basis action here\n";
1961       if (active_field_index == -1) {
1962         active_field_index = f;
1963         code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? "P_1d" + var_suffix : "1")
1964              << "] = {0.0};\n";
1965       } else {
1966         code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_in_" << active_field_index << ";\n";
1967       }
1968     } else {
1969       // ---- Restriction
1970       CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
1971                                                                  max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1972 
1973       // ---- Basis action
1974       CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
1975                                                            is_all_tensor, is_at_points, use_3d_slices));
1976     }
1977   }
1978 
1979   // -- Loop over active field
1980   std::string active_var_suffix = "_in_" + std::to_string(active_field_index);
1981 
1982   code << "\n" << tab << "// Loop over nodes in active field\n";
1983   code << tab << "for (CeedInt n = 0; n < num_comp" << active_var_suffix << "*P_1d" << active_var_suffix
1984        << (max_dim > 1 ? "*P_1d" + active_var_suffix : "") << (max_dim > 2 ? "*P_1d" + active_var_suffix : "") << "; n++) {\n";
1985   tab.push();
1986 
1987   // -- Set current active node and component to 1
1988   code << tab << "// Set current active node and component to 1.0\n";
1989   code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 1.0, r_e"
1990        << active_var_suffix << ");\n\n";
1991 
1992   for (CeedInt i = 0; i < num_input_fields; i++) {
1993     bool          is_active = false;
1994     const char   *field_name;
1995     const CeedInt f = input_field_order[i];
1996 
1997     {
1998       CeedVector vec;
1999 
2000       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
2001       is_active = vec == CEED_VECTOR_ACTIVE;
2002       CeedCallBackend(CeedVectorDestroy(&vec));
2003     }
2004     if (!is_active) continue;
2005 
2006     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
2007     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
2008 
2009     // ---- Basis action
2010     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
2011                                                          is_all_tensor, is_at_points, use_3d_slices));
2012   }
2013 
2014   // -- Q function
2015   CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
2016                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
2017                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices, true));
2018 
2019   // -- Output basis and restriction
2020   code << "\n" << tab << "// -- Output field basis action and restrictions\n";
2021   for (CeedInt i = 0; i < num_output_fields; i++) {
2022     bool        is_active = false;
2023     const char *field_name;
2024 
2025     {
2026       CeedVector vec;
2027 
2028       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
2029       is_active = vec == CEED_VECTOR_ACTIVE;
2030       CeedCallBackend(CeedVectorDestroy(&vec));
2031     }
2032     if (!is_active) continue;
2033 
2034     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
2035     code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
2036 
2037     // ---- Basis action
2038     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
2039                                                          is_all_tensor, is_at_points, use_3d_slices));
2040 
2041     // ---- Restriction
2042     if (is_full) {
2043       std::string         var_suffix = "_out_" + std::to_string(i);
2044       CeedInt             comp_stride;
2045       CeedSize            l_size;
2046       CeedElemRestriction elem_rstr;
2047 
2048       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
2049       CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
2050       code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
2051       CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
2052       code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
2053       code << tab << "WriteLVecStandard" << max_dim << "d_Assembly<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
2054            << ">(data, l_size" << var_suffix << ", elem, n, r_e" << var_suffix << ", values_array);\n";
2055       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
2056     } else {
2057       std::string         var_suffix = "_out_" + std::to_string(i);
2058       CeedInt             comp_stride;
2059       CeedSize            l_size;
2060       CeedElemRestriction elem_rstr;
2061 
2062       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
2063       CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
2064       code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
2065       CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
2066       code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
2067       code << tab << "WriteLVecStandard" << max_dim << "d_Single<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
2068            << ">(data, l_size" << var_suffix << ", elem, n, indices.outputs[" << i << "], r_e" << var_suffix << ", values_array);\n";
2069       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
2070     }
2071   }
2072 
2073   // -- Reset current active node and component
2074   code << "\n" << tab << "// Reset current active node and component to 0.0\n";
2075   code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 0.0, r_e"
2076        << active_var_suffix << ");\n";
2077 
2078   // -- End of loop over active field
2079   tab.pop();
2080   code << tab << "}\n";
2081 
2082   // Close loop and function
2083   tab.pop();
2084   code << tab << "}\n";
2085   tab.pop();
2086   code << tab << "}\n";
2087   code << tab << "// -----------------------------------------------------------------------------\n\n";
2088 
2089   CeedInt block_sizes[3] = {0, 0, 0};
2090   CeedInt num_elem;
2091 
2092   // Compile
2093   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
2094   CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes));
2095   {
2096     bool is_compile_good = false;
2097 
2098     data->thread_1d = block_sizes[0];
2099     CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good,
2100                                        is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 2, "OP_T_1D", block_sizes[0],
2101                                        "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]));
2102     if (is_compile_good) {
2103       *is_good_build = true;
2104       CeedCallBackend(CeedGetKernel_Hip(ceed, is_full ? data->module_assemble_full : data->module_assemble_diagonal, operator_name.c_str(),
2105                                         is_full ? &data->assemble_full : &data->assemble_diagonal));
2106     } else {
2107       *is_good_build              = false;
2108       data->use_assembly_fallback = true;
2109     }
2110   }
2111   CeedCallBackend(CeedDestroy(&ceed));
2112   CeedCallBackend(CeedQFunctionDestroy(&qf));
2113   return CEED_ERROR_SUCCESS;
2114 }
2115 
//------------------------------------------------------------------------------
// Build diagonal assembly operator kernel for an AtPoints operator
//
// Thin wrapper around CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen with
// is_full = false, so the generated kernel targets data->module_assemble_diagonal
// / data->assemble_diagonal. On return, *is_good_build reports whether the JIT
// compile succeeded; on failure the shared builder sets the assembly fallback.
//------------------------------------------------------------------------------
extern "C" int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) {
  return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, false, is_good_build);
}
2119 
//------------------------------------------------------------------------------
// Build full assembly operator kernel for an AtPoints operator
//
// Thin wrapper around CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen with
// is_full = true, so the generated kernel targets data->module_assemble_full /
// data->assemble_full and emits the "_Assembly" write path for the output
// restriction. On return, *is_good_build reports whether the JIT compile
// succeeded; on failure the shared builder sets the assembly fallback.
//------------------------------------------------------------------------------
extern "C" int CeedOperatorBuildKernelFullAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) {
  return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, true, is_good_build);
}
2123 //------------------------------------------------------------------------------
2124 // Build QFunction assembly operator kernel
2125 //------------------------------------------------------------------------------
2126 extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperator op, bool *is_good_build) {
2127   bool                   is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
2128   Ceed                   ceed;
2129   CeedInt                Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0;
2130   CeedQFunctionField    *qf_input_fields, *qf_output_fields;
2131   CeedQFunction_Hip_gen *qf_data;
2132   CeedQFunction          qf;
2133   CeedOperatorField     *op_input_fields, *op_output_fields;
2134   CeedOperator_Hip_gen  *data;
2135   std::ostringstream     code;
2136   Tab                    tab;
2137 
2138   // Check compatibility
2139   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
2140   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
2141   CeedCheck(!is_at_points, ceed, CEED_ERROR_BACKEND, "AtPoints QFunction assembly is not supported");
2142 
2143   // Check field compatibility
2144   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
2145   {
2146     bool has_shared_bases = true;
2147 
2148     for (CeedInt i = 0; i < num_input_fields; i++) {
2149       CeedBasis basis;
2150 
2151       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
2152       if (basis != CEED_BASIS_NONE) {
2153         bool        is_tensor = true;
2154         const char *resource;
2155         char       *resource_root;
2156         Ceed        basis_ceed;
2157 
2158         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
2159         is_all_tensor    = is_all_tensor && is_tensor;
2160         is_all_nontensor = is_all_nontensor && !is_tensor;
2161         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
2162         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
2163         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
2164         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
2165         CeedCallBackend(CeedFree(&resource_root));
2166         CeedCallBackend(CeedDestroy(&basis_ceed));
2167       }
2168       CeedCallBackend(CeedBasisDestroy(&basis));
2169     }
2170 
2171     for (CeedInt i = 0; i < num_output_fields; i++) {
2172       CeedBasis basis;
2173 
2174       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
2175       if (basis != CEED_BASIS_NONE) {
2176         bool        is_tensor = true;
2177         const char *resource;
2178         char       *resource_root;
2179         Ceed        basis_ceed;
2180 
2181         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
2182         is_all_tensor    = is_all_tensor && is_tensor;
2183         is_all_nontensor = is_all_nontensor && !is_tensor;
2184 
2185         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
2186         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
2187         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
2188         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
2189         CeedCallBackend(CeedFree(&resource_root));
2190         CeedCallBackend(CeedDestroy(&basis_ceed));
2191       }
2192       CeedCallBackend(CeedBasisDestroy(&basis));
2193     }
2194   }
2195 
2196   // Retrieve operator data
2197   CeedCallBackend(CeedOperatorGetData(op, &data));
2198   Q       = data->Q;
2199   Q_1d    = data->Q_1d;
2200   max_dim = data->dim;
2201   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
2202   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
2203   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
2204 
2205   // Load basis source files
2206   if (!is_all_nontensor) {
2207     code << tab << "// Tensor basis source\n";
2208     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
2209   }
2210   if (!is_all_tensor) {
2211     code << tab << "// Non-tensor basis source\n";
2212     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n";
2213   }
2214   if (!is_all_tensor && !is_all_nontensor) {
2215     code << "// Tensor basis source\n";
2216     code << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h>\n\n";
2217   }
2218   code << "// CodeGen operator source\n";
2219   code << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
2220 
2221   // Get QFunction name
2222   std::string qfunction_name(qf_data->qfunction_name);
2223   std::string operator_name;
2224 
2225   operator_name = "CeedKernelHipGenQFunctionAssembly_" + qfunction_name;
2226 
2227   // Define CEED_Q_VLA
2228   code << "\n" << tab << "#undef CEED_Q_VLA\n";
2229   if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
2230     code << tab << "#define CEED_Q_VLA 1\n\n";
2231   } else {
2232     code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n";
2233   }
2234 
2235   // Add user QFunction source
2236   {
2237     const char *source_path;
2238 
2239     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
2240     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
2241 
2242     code << tab << "// User QFunction source\n";
2243     code << tab << "#include \"" << source_path << "\"\n\n";
2244   }
2245 
2246   // Setup
2247   code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
2248   code << tab << "// Operator Assembly Kernel\n";
2249   code << tab << "// \n";
2250   code << tab << "// d_[in,out]_i:   CeedVector device array\n";
2251   code << tab << "// r_[in,out]_e_i: Element vector register\n";
2252   code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
2253   code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
2254   code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
2255   code << tab << "// \n";
2256   code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
2257   code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
2258   code << tab << "// -----------------------------------------------------------------------------\n";
2259   code << tab << "extern \"C\" __global__ void " << operator_name
2260        << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar *W, Points_Hip "
2261           "points, CeedScalar *__restrict__ values_array) {\n";
2262   tab.push();
2263 
2264   // Scratch buffers
2265   for (CeedInt i = 0; i < num_input_fields; i++) {
2266     CeedEvalMode eval_mode;
2267 
2268     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
2269     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
2270       code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
2271     }
2272   }
2273   for (CeedInt i = 0; i < num_output_fields; i++) {
2274     bool is_active = false;
2275 
2276     {
2277       CeedVector vec;
2278 
2279       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
2280       is_active = vec == CEED_VECTOR_ACTIVE;
2281       CeedCallBackend(CeedVectorDestroy(&vec));
2282     }
2283     if (is_active) {
2284       code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
2285     }
2286   }
2287 
2288   code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
2289   if (!is_all_tensor) {
2290     code << tab << "const CeedInt Q = " << Q << ";\n";
2291   }
2292   if (!is_all_nontensor) {
2293     code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
2294   }
2295 
2296   // Shared data
2297   code << tab << "extern __shared__ CeedScalar slice[];\n";
2298   code << tab << "SharedData_Hip data;\n";
2299   code << tab << "data.t_id_x = threadIdx.x;\n";
2300   code << tab << "data.t_id_y = threadIdx.y;\n";
2301   code << tab << "data.t_id_z = threadIdx.z;\n";
2302   code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
2303   code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
2304 
2305   // -- Determine input mat reuse
2306   FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
2307 
2308   for (CeedInt i = 0; i < num_input_fields; i++) {
2309     input_matrix_reuse[i].index = -1;
2310   }
2311   for (CeedInt i = 0; i < num_input_fields; i++) {
2312     bool         is_tensor = true;
2313     CeedEvalMode eval_mode_i;
2314     CeedBasis    basis_i;
2315 
2316     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
2317     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
2318     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
2319     CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
2320     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
2321       CeedEvalMode eval_mode_j;
2322       CeedBasis    basis_j;
2323 
2324       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
2325       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
2326       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
2327       if (basis_i == basis_j) {
2328         if (is_tensor) {
2329           input_matrix_reuse[i].index     = j;
2330           input_matrix_reuse[i].is_input  = true;
2331           input_matrix_reuse[i].eval_mode = eval_mode_j;
2332         } else {
2333           // For non-tensor can only re-use with the same eval mode
2334           if (eval_mode_i == eval_mode_j) {
2335             input_matrix_reuse[i].index     = j;
2336             input_matrix_reuse[i].is_input  = true;
2337             input_matrix_reuse[i].eval_mode = eval_mode_j;
2338           }
2339         }
2340       }
2341       CeedCallBackend(CeedBasisDestroy(&basis_j));
2342     }
2343     CeedCallBackend(CeedBasisDestroy(&basis_i));
2344   }
2345 
2346   // -- Determine output mat reuse
2347   FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
2348 
2349   for (CeedInt i = 0; i < num_output_fields; i++) {
2350     output_matrix_reuse[i].index = -1;
2351   }
2352   for (CeedInt i = 0; i < num_output_fields; i++) {
2353     bool         is_tensor = true;
2354     CeedEvalMode eval_mode_i;
2355     CeedBasis    basis_i;
2356 
2357     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
2358     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
2359     CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
2360     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
2361       CeedEvalMode eval_mode_j;
2362       CeedBasis    basis_j;
2363 
2364       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
2365       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
2366       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
2367       if (basis_i == basis_j) {
2368         if (is_tensor) {
2369           output_matrix_reuse[i].index     = j;
2370           output_matrix_reuse[i].is_input  = true;
2371           output_matrix_reuse[i].eval_mode = eval_mode_j;
2372         } else {
2373           // For non-tensor can only re-use with the same eval mode
2374           if (eval_mode_i == eval_mode_j) {
2375             output_matrix_reuse[i].index     = j;
2376             output_matrix_reuse[i].is_input  = true;
2377             output_matrix_reuse[i].eval_mode = eval_mode_j;
2378           }
2379         }
2380       }
2381       CeedCallBackend(CeedBasisDestroy(&basis_j));
2382     }
2383     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
2384       CeedEvalMode eval_mode_j;
2385       CeedBasis    basis_j;
2386 
2387       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
2388       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
2389       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
2390       if (basis_i == basis_j) {
2391         if (is_tensor) {
2392           output_matrix_reuse[i].index     = j;
2393           output_matrix_reuse[i].is_input  = false;
2394           output_matrix_reuse[i].eval_mode = eval_mode_j;
2395         } else {
2396           // For non-tensor can only re-use with the same eval mode
2397           if (eval_mode_i == eval_mode_j) {
2398             output_matrix_reuse[i].index     = j;
2399             output_matrix_reuse[i].is_input  = false;
2400             output_matrix_reuse[i].eval_mode = eval_mode_j;
2401           }
2402         }
2403       }
2404       CeedCallBackend(CeedBasisDestroy(&basis_j));
2405     }
2406     CeedCallBackend(CeedBasisDestroy(&basis_i));
2407   }
2408 
2409   // Initialize constants, and matrices B and G
2410   code << "\n" << tab << "// Input field constants and basis data\n";
2411   for (CeedInt i = 0; i < num_input_fields; i++) {
2412     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
2413                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, true));
2414   }
2415   code << "\n" << tab << "// Output field constants and basis data\n";
2416   for (CeedInt i = 0; i < num_output_fields; i++) {
2417     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
2418                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, true));
2419   }
2420 
2421   // Loop over all elements
2422   code << "\n" << tab << "// Element loop\n";
2423   code << tab << "__syncthreads();\n";
2424   code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
2425   tab.push();
2426 
2427   // -- Compute minimum buffer space needed
2428   CeedInt max_rstr_buffer_size = 1;
2429 
2430   for (CeedInt i = 0; i < num_input_fields; i++) {
2431     CeedEvalMode eval_mode;
2432 
2433     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
2434     if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
2435       CeedInt             num_comp;
2436       CeedElemRestriction elem_rstr;
2437 
2438       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
2439       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
2440       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
2441       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
2442     }
2443   }
2444   for (CeedInt i = 0; i < num_output_fields; i++) {
2445     CeedEvalMode eval_mode;
2446 
2447     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
2448     if (eval_mode != CEED_EVAL_NONE) {
2449       CeedInt             num_comp;
2450       CeedElemRestriction elem_rstr;
2451 
2452       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
2453       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
2454       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
2455       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
2456     }
2457   }
2458   code << tab << "// Scratch restriction buffer space\n";
2459   code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
2460 
2461   // -- Determine best input field processing order
2462   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
2463 
2464   for (CeedInt i = 0; i < num_input_fields; i++) {
2465     field_rstr_in_buffer[i] = -1;
2466     input_field_order[i]    = -1;
2467   }
2468   {
2469     bool    is_ordered[CEED_FIELD_MAX];
2470     CeedInt curr_index = 0;
2471 
2472     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
2473     for (CeedInt i = 0; i < num_input_fields; i++) {
2474       CeedVector          vec_i;
2475       CeedElemRestriction rstr_i;
2476 
2477       if (is_ordered[i]) continue;
2478       field_rstr_in_buffer[i]       = i;
2479       is_ordered[i]                 = true;
2480       input_field_order[curr_index] = i;
2481       curr_index++;
2482       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
2483       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
2484       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
2485       for (CeedInt j = i + 1; j < num_input_fields; j++) {
2486         CeedVector          vec_j;
2487         CeedElemRestriction rstr_j;
2488 
2489         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
2490         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
2491         if (rstr_i == rstr_j && vec_i == vec_j) {
2492           field_rstr_in_buffer[j]       = i;
2493           is_ordered[j]                 = true;
2494           input_field_order[curr_index] = j;
2495           curr_index++;
2496         }
2497         CeedCallBackend(CeedVectorDestroy(&vec_j));
2498         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
2499       }
2500       CeedCallBackend(CeedVectorDestroy(&vec_i));
2501       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
2502     }
2503   }
2504 
2505   // -- Input restriction and basis
2506   code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
2507   CeedInt num_active_in = 0, num_active_out = 0, qf_assembly_size_out = 0;
2508   CeedInt active_fields_in[CEED_FIELD_MAX], active_fields_out[CEED_FIELD_MAX];
2509 
2510   for (CeedInt i = 0; i < num_input_fields; i++) {
2511     bool          is_active = false;
2512     const char   *field_name;
2513     const CeedInt f = input_field_order[i];
2514 
2515     {
2516       CeedVector vec;
2517 
2518       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
2519       is_active = vec == CEED_VECTOR_ACTIVE;
2520       CeedCallBackend(CeedVectorDestroy(&vec));
2521     }
2522 
2523     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
2524     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
2525 
2526     if (is_active) {
2527       CeedEvalMode eval_mode;
2528       CeedInt      field_size;
2529 
2530       active_fields_in[num_active_in] = f;
2531       num_active_in++;
2532       CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[f], &field_size));
2533       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[f], &eval_mode));
2534       if (eval_mode == CEED_EVAL_GRAD) {
2535         code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << "dim_in_" << f << "*"
2536              << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
2537       } else {
2538         code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
2539       }
2540       code << tab << "const CeedInt field_size_in_" << f << " = " << field_size << ";\n";
2541     } else {
2542       // ---- Restriction
2543       CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
2544                                                                  max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
2545 
2546       // ---- Basis action
2547       CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
2548                                                            is_all_tensor, is_at_points, use_3d_slices));
2549     }
2550   }
2551   code << tab << "const CeedInt field_sizes_in[" << num_active_in << "] = {";
2552   for (CeedInt i = 0; i < num_active_in; i++) {
2553     code << "field_size_in_" << active_fields_in[i] << (i < num_active_in - 1 ? ", " : "");
2554   }
2555   code << "};\n";
2556   code << tab << "CeedScalar * r_q_in[" << num_active_in << "] = {";
2557   for (CeedInt i = 0; i < num_active_in; i++) {
2558     code << "r_q_in_" << active_fields_in[i] << (i < num_active_in - 1 ? ", " : "");
2559   }
2560   code << "};\n";
2561 
2562   for (CeedInt i = 0; i < num_output_fields; i++) {
2563     bool is_active = false;
2564 
2565     {
2566       CeedVector vec;
2567 
2568       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
2569       is_active = vec == CEED_VECTOR_ACTIVE;
2570       CeedCallBackend(CeedVectorDestroy(&vec));
2571     }
2572     if (is_active) {
2573       const char *field_name;
2574       CeedInt     field_size;
2575 
2576       active_fields_out[num_active_out] = i;
2577       num_active_out++;
2578       CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
2579       qf_assembly_size_out += field_size;
2580       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
2581       code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
2582       code << tab << "const CeedInt field_size_out_" << i << " = " << field_size << ";\n";
2583     }
2584   }
2585   code << tab << "const CeedInt field_sizes_out[" << num_active_out << "] = {";
2586   for (CeedInt i = 0; i < num_active_out; i++) {
2587     code << "field_size_out_" << active_fields_out[i] << (i < num_active_out - 1 ? ", " : "");
2588   }
2589   code << "};\n";
2590   code << tab << "const CeedInt total_size_out = " << qf_assembly_size_out << ";\n";
2591 
2592   // -- Loop over active field
2593   code << "\n" << tab << "CeedInt input_offset = 0;\n";
2594   code << tab << "// Loop over active QFunction input fields\n";
2595   code << tab << "const CeedInt num_active_in = " << num_active_in << ";\n";
2596   code << tab << "for (CeedInt a = 0; a < num_active_in; a++) {\n";
2597   tab.push();
2598 
2599   // -- Loop over size of active field
2600   code << "\n" << tab << "// Loop over current active input field size\n";
2601   code << tab << "const CeedInt field_size_in = field_sizes_in[a];\n";
2602   code << tab << "for (CeedInt s = 0; s < field_size_in; s++) {\n";
2603   tab.push();
2604 
2605   // -- Set current active point and component to 1
2606   code << tab << "// Set current active point and component to 1.0\n";
2607   if (is_all_tensor && (max_dim >= 3)) {
2608     code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 1.0;\n";
2609   } else {
2610     code << tab << "r_q_in[a][s] = 1.0;\n";
2611   }
2612 
2613   // -- Q function
2614   CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
2615                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
2616                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices, true));
2617 
2618   // -- Output basis and restriction
2619   code << "\n" << tab << "// -- Output field basis action and restrictions\n";
2620   CeedScalar offset = 0;
2621 
2622   for (CeedInt i = 0; i < num_output_fields; i++) {
2623     bool        is_active = false;
2624     const char *field_name;
2625 
2626     {
2627       CeedVector vec;
2628 
2629       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
2630       is_active = vec == CEED_VECTOR_ACTIVE;
2631       CeedCallBackend(CeedVectorDestroy(&vec));
2632     }
2633     if (!is_active) continue;
2634 
2635     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
2636     code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
2637 
2638     // ---- Restriction
2639     CeedInt field_size;
2640 
2641     code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d_QFAssembly<total_size_out, field_size_out_" << i << ", "
2642          << (is_all_tensor ? "Q_1d" : "Q") << ">(data, num_elem, elem, input_offset + s, " << offset << ", r_q_out_" << i << ", values_array);\n";
2643     CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
2644     offset += field_size;
2645   }
2646 
2647   // -- Reset current active node and component
2648   code << "\n" << tab << "// Reset current active node and component to 0.0\n";
2649   if (is_all_tensor && (max_dim >= 3)) {
2650     code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 0.0;\n";
2651   } else {
2652     code << tab << "r_q_in[a][s] = 0.0;\n";
2653   }
2654 
2655   // -- End of loop over size of active field
2656   tab.pop();
2657   code << tab << "}\n";
2658   code << tab << "input_offset += field_size_in;\n";
2659 
2660   // -- End of loop over active field
2661   tab.pop();
2662   code << tab << "}\n";
2663 
2664   // Close loop and function
2665   tab.pop();
2666   code << tab << "}\n";
2667   tab.pop();
2668   code << tab << "}\n";
2669   code << tab << "// -----------------------------------------------------------------------------\n\n";
2670 
2671   CeedInt block_sizes[3] = {0, 0, 0};
2672   CeedInt num_elem;
2673 
2674   // Compile
2675   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
2676   CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes));
2677   {
2678     bool is_compile_good = false;
2679 
2680     data->thread_1d = block_sizes[0];
2681     CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 2, "OP_T_1D", block_sizes[0],
2682                                        "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]));
2683     if (is_compile_good) {
2684       *is_good_build = true;
2685       CeedCallBackend(CeedGetKernel_Hip(ceed, data->module_assemble_qfunction, operator_name.c_str(), &data->assemble_qfunction));
2686     } else {
2687       *is_good_build              = false;
2688       data->use_assembly_fallback = true;
2689     }
2690   }
2691   CeedCallBackend(CeedDestroy(&ceed));
2692   CeedCallBackend(CeedQFunctionDestroy(&qf));
2693   return CEED_ERROR_SUCCESS;
2694 }
2695 
2696 //------------------------------------------------------------------------------
2697