xref: /libCEED/backends/hip-gen/ceed-hip-gen-operator-build.cpp (revision ca1da9b9b9d880e22f5c01cec39b447b56f5eccd)
1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #define CEED_DEBUG_COLOR 12
9 
10 #include <ceed.h>
11 #include <ceed/backend.h>
12 #include <ceed/gen-tools.h>
13 #include <ceed/jit-tools.h>
14 
15 #include <iostream>
16 #include <sstream>
17 #include <string>
18 
19 #include "../hip-ref/ceed-hip-ref.h"
20 #include "../hip-shared/ceed-hip-shared.h"
21 #include "../hip/ceed-hip-common.h"
22 #include "../hip/ceed-hip-compile.h"
23 #include "ceed-hip-gen.h"
24 
// Describes when a field may alias the shared-memory basis matrices (s_B/s_G)
// already loaded for an earlier field in the generated kernel.
struct FieldReuse_Hip {
  CeedInt      index;      // Index of the earlier field to reuse; -1 means no reuse
  bool         is_input;   // True when the reused field is an input field
  CeedEvalMode eval_mode;  // Eval mode of the reused field (gradient reuse requires CEED_EVAL_GRAD)
};
30 
31 //------------------------------------------------------------------------------
32 // Calculate the block size used for launching the operator kernel
33 //------------------------------------------------------------------------------
34 extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt num_elem, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes) {
35   const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
36   if (dim == 1) {
37     CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
38 
39     elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
40     block_sizes[0]  = thread_1d;
41     block_sizes[1]  = 1;
42     block_sizes[2]  = elems_per_block;
43   } else if (dim == 2) {
44     const CeedInt elems_per_block = thread_1d < 4 ? 16 : 2;
45 
46     block_sizes[0] = thread_1d;
47     block_sizes[1] = thread_1d;
48     block_sizes[2] = elems_per_block;
49   } else if (dim == 3) {
50     const CeedInt elems_per_block = thread_1d < 6 ? 4 : (thread_1d < 8 ? 2 : 1);
51 
52     block_sizes[0] = thread_1d;
53     block_sizes[1] = thread_1d;
54     block_sizes[2] = elems_per_block;
55   }
56   return CEED_ERROR_SUCCESS;
57 }
58 
59 //------------------------------------------------------------------------------
60 // Determine type of operator
61 //------------------------------------------------------------------------------
62 static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
63                                                CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields,
64                                                CeedQFunctionField *qf_output_fields, CeedInt *max_P, CeedInt *max_P_1d, CeedInt *Q, CeedInt *Q_1d,
65                                                CeedInt *max_dim, bool *is_all_tensor, bool *use_3d_slices) {
66   // Check if all are tensor
67   *is_all_tensor = true;
68   for (CeedInt i = 0; i < num_input_fields; i++) {
69     CeedBasis basis;
70 
71     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
72     if (basis != CEED_BASIS_NONE) {
73       bool is_field_tensor;
74 
75       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
76       *is_all_tensor = *is_all_tensor && is_field_tensor;
77     }
78     CeedCallBackend(CeedBasisDestroy(&basis));
79   }
80   for (CeedInt i = 0; i < num_output_fields; i++) {
81     CeedBasis basis;
82 
83     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
84     if (basis != CEED_BASIS_NONE) {
85       bool is_field_tensor;
86 
87       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
88       *is_all_tensor = *is_all_tensor && is_field_tensor;
89     }
90     CeedCallBackend(CeedBasisDestroy(&basis));
91   }
92 
93   // Find max_P, max_P_1d, Q, and Q_1d
94   bool is_all_3d = true;
95 
96   *max_P    = 0;
97   *max_P_1d = 0;
98   *Q        = 0;
99   *Q_1d     = 0;
100   for (CeedInt i = 0; i < num_input_fields; i++) {
101     CeedBasis basis;
102 
103     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
104     if (basis != CEED_BASIS_NONE) {
105       bool    is_field_tensor;
106       CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
107 
108       // Check if 3D
109       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
110       is_all_3d = is_all_3d && (field_dim == 3);
111       *max_dim  = CeedIntMax(*max_dim, field_dim);
112 
113       // Collect P, P_1d, Q, and Q_1d
114       CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
115       *max_P = CeedIntMax(*max_P, field_P);
116       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
117       if (is_field_tensor) {
118         CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
119         *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
120       }
121       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
122       CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
123       *Q = field_Q;
124       if (is_field_tensor) {
125         CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
126         CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
127         *Q_1d = field_Q_1d;
128       }
129     }
130     CeedCallBackend(CeedBasisDestroy(&basis));
131   }
132   for (CeedInt i = 0; i < num_output_fields; i++) {
133     CeedBasis basis;
134 
135     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
136     if (basis != CEED_BASIS_NONE) {
137       bool    is_field_tensor;
138       CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
139 
140       // Check if 3D
141       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
142       is_all_3d = is_all_3d && (field_dim == 3);
143       *max_dim  = CeedIntMax(*max_dim, field_dim);
144 
145       // Collect P, P_1d, Q, and Q_1d
146       CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
147       *max_P = CeedIntMax(*max_P, field_P);
148       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
149       if (is_field_tensor) {
150         CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
151         *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
152       }
153       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
154       CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
155       *Q = field_Q;
156       if (is_field_tensor) {
157         CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
158         CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
159         *Q_1d = field_Q_1d;
160       }
161     }
162     CeedCallBackend(CeedBasisDestroy(&basis));
163   }
164 
165   // Only use 3D collocated gradient parallelization strategy when gradient is computed
166   *use_3d_slices = false;
167   if (is_all_3d && *is_all_tensor) {
168     bool was_grad_found = false;
169 
170     for (CeedInt i = 0; i < num_input_fields; i++) {
171       CeedEvalMode eval_mode;
172 
173       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
174       if (eval_mode == CEED_EVAL_GRAD) {
175         CeedBasis_Hip_shared *basis_data;
176         CeedBasis             basis;
177 
178         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
179         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
180         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
181         was_grad_found = true;
182         CeedCallBackend(CeedBasisDestroy(&basis));
183       }
184     }
185     for (CeedInt i = 0; i < num_output_fields; i++) {
186       CeedEvalMode eval_mode;
187 
188       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
189       if (eval_mode == CEED_EVAL_GRAD) {
190         CeedBasis_Hip_shared *basis_data;
191         CeedBasis             basis;
192 
193         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
194         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
195         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
196         was_grad_found = true;
197         CeedCallBackend(CeedBasisDestroy(&basis));
198       }
199     }
200   }
201   return CEED_ERROR_SUCCESS;
202 }
203 
204 //------------------------------------------------------------------------------
205 // Setup fields
206 //------------------------------------------------------------------------------
// Emit per-field constants into the generated kernel source and arrange for the basis
// matrices (interp B, gradient G) to be available: either loaded into shared memory,
// aliased from an earlier field (field_reuse), or left NULL. Also stashes the matching
// device pointers in data->B / data->G for the kernel launch.
//
// code: generated kernel source accumulator; tab: current indentation level
// i: field index within its list; is_input selects the "_in_"/"_out_" naming
// field_reuse: earlier field whose s_B/s_G may be aliased (index == -1 means none)
// skip_active_load: when true, matrices for the active field are emitted as NULL
static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i,
                                                    CeedOperatorField op_field, CeedQFunctionField qf_field, FieldReuse_Hip field_reuse,
                                                    CeedInt max_dim, CeedInt Q, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
                                                    bool use_3d_slices, bool skip_active_load) {
  bool      is_tensor = true, is_active = true;
  CeedBasis basis;

  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
  // Note: is_tensor stays true when there is no basis
  if (basis != CEED_BASIS_NONE) CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
  {
    CeedVector vec;

    // A field is "active" when its vector is the operator's active vector
    CeedCallBackend(CeedOperatorFieldGetVector(op_field, &vec));
    is_active = vec == CEED_VECTOR_ACTIVE;
    CeedCallBackend(CeedVectorDestroy(&vec));
  }

  const char           *field_name;
  std::string           var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
  std::string           P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
  std::string           option_name = (is_input ? "inputs" : "outputs");
  CeedEvalMode          eval_mode   = CEED_EVAL_NONE;
  CeedInt               elem_size = 0, num_comp = 0, dim = max_dim, P_1d = 0;
  CeedElemRestriction   elem_rstr;
  CeedBasis_Hip_shared *basis_data;

  // Field reuse info
  bool use_previous_field = field_reuse.index != -1;

  CeedCallBackend(CeedOperatorFieldGetName(op_field, &field_name));
  code << tab << "// -- " << (is_input ? "Input" : "Output") << " field " << i << ": " << field_name << "\n";

  // Get field data
  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
  }
  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
  // NOTE(review): basis_data is only assigned here, when a basis exists; the INTERP/GRAD
  // cases below dereference it — presumably those eval modes always come with a basis
  if (basis != CEED_BASIS_NONE) {
    CeedCallBackend(CeedBasisGetData(basis, &basis_data));
    CeedCallBackend(CeedBasisGetDimension(basis, &dim));
    // P_1d doubles as the total node count for non-tensor bases
    if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
    else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
  }
  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));

  // Set field constants
  code << tab << "const CeedInt dim" << var_suffix << " = " << dim << ";\n";
  // Tensor field in a mixed (not all-tensor) operator also needs its total P
  if (is_tensor && !is_all_tensor) {
    CeedInt P = 0;

    // NOTE(review): CeedBasisGetNumNodes is invoked even when basis == CEED_BASIS_NONE,
    // although the emitted value then falls back to Q — confirm CEED_BASIS_NONE tolerates this call
    CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
    code << tab << "const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q : P) << ";\n";
  }
  // Without a basis, P matches the quadrature size (no interpolation)
  code << tab << "const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
  if (eval_mode != CEED_EVAL_WEIGHT) {
    code << tab << "const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
  }

  // Load basis data
  code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
  switch (eval_mode) {
    case CEED_EVAL_NONE:
      // No basis matrices needed
      break;
    case CEED_EVAL_INTERP:
      if (is_at_points) {
        // AtPoints
        // Lazily build and upload the Chebyshev interpolation matrix the first time
        // any field on this basis needs it; cached on the basis data afterwards
        if (!basis_data->d_chebyshev_interp_1d) {
          CeedSize    interp_bytes;
          CeedScalar *chebyshev_interp_1d;

          interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
          CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
          CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
          CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
          CeedCallHip(CeedBasisReturnCeed(basis),
                      hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
          CeedCallBackend(CeedFree(&chebyshev_interp_1d));
        }
        if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
        else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
      } else {
        // Standard quadrature
        if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
        else data->B.outputs[i] = basis_data->d_interp_1d;
      }
      if (use_previous_field && !skip_active_load) {
        // Alias the interp matrix already loaded for the earlier field
        std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

        code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
      } else {
        bool is_collocated = false;

        CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
        // NULL placeholder when skipping the active field or when interp is an identity
        // (collocated nodes, non-AtPoints)
        if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) {
          code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
        } else {
          code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
          code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
        }
      }
      break;
    case CEED_EVAL_GRAD:
      // Gradient also needs the interp matrix; same AtPoints/standard selection as INTERP
      if (is_at_points) {
        // AtPoints
        // Same lazy Chebyshev upload as the INTERP case above
        if (!basis_data->d_chebyshev_interp_1d) {
          CeedSize    interp_bytes;
          CeedScalar *chebyshev_interp_1d;

          interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
          CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
          CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
          CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
          CeedCallHip(CeedBasisReturnCeed(basis),
                      hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
          CeedCallBackend(CeedFree(&chebyshev_interp_1d));
        }
        if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
        else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
      } else {
        // Standard quadrature
        if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
        else data->B.outputs[i] = basis_data->d_interp_1d;
      }
      // s_B is only emitted for tensor bases here (non-tensor gradient uses s_G alone)
      if (is_tensor) {
        if (use_previous_field && !skip_active_load) {
          std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

          code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
        } else {
          bool is_collocated = false;

          CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
          if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) {
            code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
          } else {
            code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
            code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
          }
        }
      }
      if (is_at_points) break;  // No G mat for AtPoints
      if (use_3d_slices) {
        // 3D slices strategy: gradient acts at quadrature points (Q x Q collocated matrix)
        if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
        else data->G.outputs[i] = basis_data->d_collo_grad_1d;
        // Gradient reuse additionally requires the earlier field to be CEED_EVAL_GRAD
        if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
          std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

          code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
        } else if (is_active && skip_active_load) {
          code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
        } else {
          code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
          code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
        }
      } else {
        // Prefer the collocated gradient when the basis provides one
        bool has_collo_grad = basis_data->d_collo_grad_1d;

        if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
        else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
        if (has_collo_grad) {
          // Collocated gradient: Q x Q matrix
          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
            std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

            code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
          } else if (is_active && skip_active_load) {
            code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
          } else {
            code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
            code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
          }
        } else {
          // Full gradient: P x Q (tensor) or P x Q x dim (non-tensor) matrix
          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
            std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

            code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
          } else if (is_active && skip_active_load) {
            code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
          } else {
            code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim")
                 << (is_tensor ? "" : var_suffix) << "];\n";
            code << tab << "LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << (is_tensor ? "" : var_suffix) << ">(data, G."
                 << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
          }
        }
      }
      break;
    case CEED_EVAL_WEIGHT:
      break;  // No action
      // LCOV_EXCL_START
    case CEED_EVAL_DIV:
    case CEED_EVAL_CURL:
      break;  // TODO: Not implemented
              // LCOV_EXCL_STOP
  }
  CeedCallBackend(CeedBasisDestroy(&basis));
  return CEED_ERROR_SUCCESS;
}
406 
407 //------------------------------------------------------------------------------
408 // Restriction
409 //------------------------------------------------------------------------------
// Emit the E-vector read (input) or write (output) code for one operator field and
// record the device offset arrays in data->indices for the kernel launch.
//
// field_input_buffer: for inputs only — field_input_buffer[i] != i means field i shares
// the E-vector buffer of an earlier identical input, so no restriction code is emitted.
static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i,
                                                      CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
                                                      CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
                                                      bool use_3d_slices) {
  std::string              var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
  std::string              P_name     = (is_all_tensor ? "P_1d" : "P") + var_suffix;
  CeedEvalMode             eval_mode  = CEED_EVAL_NONE;
  CeedInt                  elem_size = 0, num_comp = 0;
  CeedSize                 l_size;
  CeedRestrictionType      rstr_type = CEED_RESTRICTION_STANDARD;
  CeedElemRestriction_Hip *rstr_data;
  CeedElemRestriction      elem_rstr;

  // Get field data
  // NOTE(review): rstr_data stays uninitialized when elem_rstr == CEED_ELEMRESTRICTION_NONE;
  // the branches below that read rstr_data->d_offsets presumably only run with a real
  // restriction — confirm against callers
  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
    CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
    CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
  }
  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));

  // Restriction
  if (is_input) {
    // Input
    if (field_input_buffer[i] != i) {
      std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);

      // Restriction was already done for previous input
      code << tab << "CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
    } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) {
      // WEIGHT needs no E-vector; NONE with 3D-slices AtPoints is handled elsewhere
      if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) {
        // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated
        code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
      } else if (rstr_type != CEED_RESTRICTION_POINTS) {
        // Otherwise we're using the scratch space
        code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
      }
      switch (rstr_type) {
        case CEED_RESTRICTION_STANDARD: {
          // Offset-indexed gather from the L-vector
          CeedInt comp_stride;

          CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
          code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
          data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
          code << tab << "ReadLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
               << P_name << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix
               << ");\n";
          break;
        }
        case CEED_RESTRICTION_STRIDED: {
          // Strided gather; default backend strides unless explicit ones are set
          bool    has_backend_strides;
          CeedInt num_elem;

          CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
          CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
          CeedInt strides[3] = {1, elem_size * num_elem, elem_size};

          if (!has_backend_strides) {
            CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
          }
          code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
               << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
          code << tab << "ReadLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
               << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, d" << var_suffix << ", r_e"
               << var_suffix << ");\n";
          break;
        }
        case CEED_RESTRICTION_POINTS: {
          // AtPoints: only offsets and comp stride are recorded; no read emitted here
          CeedInt comp_stride;

          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
          data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
          break;
        }
        // LCOV_EXCL_START
        case CEED_RESTRICTION_ORIENTED:
        case CEED_RESTRICTION_CURL_ORIENTED:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
  } else {
    // Output
    switch (rstr_type) {
      case CEED_RESTRICTION_STANDARD: {
        // Offset-indexed scatter back to the L-vector
        CeedInt comp_stride;

        CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
        code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
        CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
        code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
        data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
        code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
             << P_name << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix
             << ");\n";
        break;
      }
      case CEED_RESTRICTION_STRIDED: {
        // Strided scatter; mirrors the strided input path above
        bool    has_backend_strides;
        CeedInt num_elem;

        CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
        CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
        CeedInt strides[3] = {1, elem_size * num_elem, elem_size};

        if (!has_backend_strides) {
          CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
        }
        code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
             << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
        code << tab << "WriteLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
             << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, r_e" << var_suffix << ", d" << var_suffix
             << ");\n";
        break;
      }
      case CEED_RESTRICTION_POINTS:
        // AtPoints: only the offsets are recorded; no write emitted here
        data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
        break;
      // LCOV_EXCL_START
      case CEED_RESTRICTION_ORIENTED:
      case CEED_RESTRICTION_CURL_ORIENTED:
        break;  // TODO: Not implemented
                // LCOV_EXCL_STOP
    }
  }
  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
  return CEED_ERROR_SUCCESS;
}
543 
544 //------------------------------------------------------------------------------
545 // Basis
546 //------------------------------------------------------------------------------
547 static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i, CeedOperatorField op_field,
548                                                 CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor,
549                                                 bool is_at_points, bool use_3d_slices) {
550   bool      is_tensor = true, is_collocated = true;
551   CeedBasis basis;
552   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
553   CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
554   CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
555 
556   std::string         var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
557   std::string         P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
558   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
559   CeedInt             dim = max_dim, elem_size = 0, num_comp = 0, P_1d = 0;
560   CeedElemRestriction elem_rstr;
561 
562   // Get field data
563   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
564   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
565     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
566     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
567   }
568   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
569   if (basis != CEED_BASIS_NONE) {
570     CeedCallBackend(CeedBasisGetDimension(basis, &dim));
571     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
572     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
573   }
574   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
575 
576   // Basis
577   code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
578   if (is_input) {
579     switch (eval_mode) {
580       case CEED_EVAL_NONE:
581         if (!use_3d_slices && !is_at_points) {
582           code << tab << "CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n";
583         }
584         break;
585       case CEED_EVAL_INTERP:
586         if (is_at_points) {
587           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
588 
589           code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
590           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
591                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
592         } else {
593           std::string function_name = is_tensor ? ((dim == 1 ? "Interp" : "InterpTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
594                                                    std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
595                                                 : "InterpNonTensor";
596           std::string op_t_1d_name  = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
597 
598           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
599           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
600                << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
601         }
602         break;
603       case CEED_EVAL_GRAD:
604         if (is_at_points) {
605           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
606 
607           code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
608           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
609                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
610         } else if (use_3d_slices) {
611           std::string function_name =
612               (dim > 1 ? "InterpTensor" : "Interp") + std::string(is_collocated ? "CollocatedNodes" : "") + std::to_string(dim) + "d";
613 
614           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
615           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
616                << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
617         } else if (is_tensor) {
618           bool        is_collocated_grad = dim == 3 && Q_1d >= P_1d;
619           std::string function_name =
620               (dim == 1 ? "Grad" : ("GradTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) +
621               std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
622           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
623 
624           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
625                << (is_all_tensor && dim >= 3 ? Q_name : "1") << "];\n";
626           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
627                << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
628         } else {
629           std::string function_name = "GradNonTensor";
630 
631           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
632           code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
633                << ", OP_T_1D>(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
634         }
635         break;
636       case CEED_EVAL_WEIGHT: {
637         if (is_at_points) {
638           code << tab << "// Nothing to do AtPoints\n";
639         } else {
640           CeedBasis_Hip_shared *basis_data;
641           std::string           function_name = is_tensor
642                                                     ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
643                                                     : "WeightNonTensor";
644 
645           code << tab << "CeedScalar r_q" << var_suffix << "[" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
646           CeedCallBackend(CeedBasisGetData(basis, &basis_data));
647           data->W = basis_data->d_q_weight_1d;
648           code << tab << function_name << "<" << P_name << ", " << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
649         }
650         break;
651       }
652       // LCOV_EXCL_START
653       case CEED_EVAL_DIV:
654       case CEED_EVAL_CURL:
655         break;  // TODO: Not implemented
656                 // LCOV_EXCL_STOP
657     }
658   } else {
659     switch (eval_mode) {
660       case CEED_EVAL_NONE:
661         code << tab << "CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
662         break;  // No action
663       case CEED_EVAL_INTERP:
664         code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
665         if (is_at_points) {
666           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
667 
668           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
669                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
670         } else {
671           std::string function_name =
672               is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
673                            std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
674                         : "InterpTransposeNonTensor";
675           std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
676 
677           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
678                << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
679         }
680         break;
681       case CEED_EVAL_GRAD:
682         code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
683         if (is_at_points) {
684           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
685 
686           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
687                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
688         } else if (use_3d_slices) {
689           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
690                                       std::to_string(dim) + "d";
691 
692           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
693                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
694         } else if (is_tensor) {
695           bool        is_collocated_grad = dim == 3 && Q_1d >= P_1d;
696           std::string function_name =
697               (dim == 1 ? "GradTranspose"
698                         : ("GradTransposeTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) +
699               std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
700           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
701 
702           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
703                << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
704         } else {
705           std::string function_name = "GradTransposeNonTensor";
706 
707           code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
708                << ", OP_T_1D>(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
709         }
710         break;
711       // LCOV_EXCL_START
712       case CEED_EVAL_WEIGHT:
713         break;  // Should not occur
714       case CEED_EVAL_DIV:
715       case CEED_EVAL_CURL:
716         break;  // TODO: Not implemented
717                 // LCOV_EXCL_STOP
718     }
719   }
720   CeedCallBackend(CeedBasisDestroy(&basis));
721   return CEED_ERROR_SUCCESS;
722 }
723 
724 //------------------------------------------------------------------------------
725 // QFunction
726 //------------------------------------------------------------------------------
727 static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt max_dim,
728                                                     CeedInt max_num_points, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
729                                                     CeedQFunctionField *qf_input_fields, CeedInt num_output_fields,
730                                                     CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields,
731                                                     std::string qfunction_name, CeedInt Q_1d, bool is_all_tensor, bool is_at_points,
732                                                     bool use_3d_slices) {
733   std::string         Q_name    = is_all_tensor ? "Q_1d" : "Q";
734   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
735   CeedElemRestriction elem_rstr;
736 
737   // Setup output arrays
738   code << "\n";
739   code << tab << "// -- Output field setup\n";
740   for (CeedInt i = 0; i < num_output_fields; i++) {
741     const char *field_name;
742     std::string var_suffix = "_out_" + std::to_string(i);
743 
744     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
745     code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
746     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
747     switch (eval_mode) {
748       case CEED_EVAL_NONE:
749         if (is_at_points) {
750           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n";
751         } else {
752           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
753                << "];\n";
754         }
755         break;
756       case CEED_EVAL_INTERP:
757         if (is_at_points) {
758           // Accumulator for point data
759           code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
760           code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
761                << "[i] = 0.0;\n";
762         } else {
763           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
764                << "];\n";
765         }
766         break;
767       case CEED_EVAL_GRAD:
768         if (is_at_points) {
769           // Accumulator for point data
770           code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
771           code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
772                << "[i] = 0.0;\n";
773         } else if (use_3d_slices) {
774           // Accumulator for gradient slices
775           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
776           code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) r_q" << var_suffix << "[i] = 0.0;\n";
777         } else {
778           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
779                << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") << "];\n";
780         }
781         break;
782       case CEED_EVAL_WEIGHT:
783         break;
784         // LCOV_EXCL_START
785       case CEED_EVAL_DIV:
786       case CEED_EVAL_CURL:
787         break;  // TODO: Not implemented
788                 // LCOV_EXCL_STOP
789     }
790   }
791 
792   if (is_at_points) {
793     // We need to handle batches of points
794     code << "\n";
795     code << tab << "// Note: Using batches of points\n";
796     code << tab << "const CeedInt point_loop_bound = (blockDim.x*blockDim.y) * ceil((1.0*max_num_points) / (blockDim.x*blockDim.y));\n\n";
797     code << tab << "#pragma unroll\n";
798     code << tab << "for (CeedInt i = threadIdx.x + threadIdx.y*blockDim.x; i < point_loop_bound; i += blockDim.x*blockDim.y) {\n";
799     tab.push();
800     code << tab << "const CeedInt p = i % max_num_points;\n\n";
801 
802     code << tab << "// -- Coordinates\n";
803     code << tab << "CeedScalar r_x[max_dim];\n";
804     code << tab << "ReadPoint<max_dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n";
805 
806     code << tab << "// -- Input fields\n";
807     for (CeedInt i = 0; i < num_input_fields; i++) {
808       const char *field_name;
809       std::string var_suffix = "_in_" + std::to_string(i);
810       std::string P_name     = "P_1d" + var_suffix;
811 
812       CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
813       code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
814       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
815       // Basis action
816       code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
817       switch (eval_mode) {
818         case CEED_EVAL_NONE:
819           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
820           code << tab << "ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
821                << ", max_num_points>(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
822           break;
823         case CEED_EVAL_INTERP:
824           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
825           code << tab << "InterpAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
826                << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
827           break;
828         case CEED_EVAL_GRAD:
829           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
830           code << tab << "GradAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
831                << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
832           break;
833         case CEED_EVAL_WEIGHT:
834           code << tab << "CeedScalar r_s" << var_suffix << "[1];\n";
835           code << tab << "r_s" << var_suffix << "[0] = 1.0;\n";
836           break;
837           // LCOV_EXCL_START
838         case CEED_EVAL_DIV:
839         case CEED_EVAL_CURL:
840           break;  // TODO: Not implemented
841                   // LCOV_EXCL_STOP
842       }
843     }
844     code << "\n";
845     code << tab << "// -- Output fields\n";
846     for (CeedInt i = 0; i < num_output_fields; i++) {
847       const char *field_name;
848       std::string var_suffix = "_out_" + std::to_string(i);
849 
850       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
851       code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
852       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
853       // Basis action
854       switch (eval_mode) {
855         case CEED_EVAL_NONE:
856           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
857           break;
858         case CEED_EVAL_INTERP:
859           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
860           break;
861         case CEED_EVAL_GRAD:
862           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
863           break;
864           // LCOV_EXCL_START
865         case CEED_EVAL_WEIGHT:
866           break;  // Should not occur
867         case CEED_EVAL_DIV:
868         case CEED_EVAL_CURL:
869           break;  // TODO: Not implemented
870                   // LCOV_EXCL_STOP
871       }
872     }
873 
874   } else if (use_3d_slices) {
875     // We treat quadrature points per slice in 3d to save registers
876     code << "\n";
877     code << tab << "// Note: Using planes of 3D elements\n";
878     code << tab << "#pragma unroll\n";
879     code << tab << "for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
880     tab.push();
881     code << tab << "// -- Input fields\n";
882     for (CeedInt i = 0; i < num_input_fields; i++) {
883       const char *field_name;
884       std::string var_suffix = "_in_" + std::to_string(i);
885 
886       CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
887       code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
888       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
889       // Basis action
890       code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
891       switch (eval_mode) {
892         case CEED_EVAL_NONE:
893           bool is_strided;
894 
895           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
896 
897           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
898           CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
899           if (is_strided) {
900             bool    has_backend_strides;
901             CeedInt num_elem, elem_size;
902 
903             CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
904             CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
905             CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
906             CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
907 
908             if (!has_backend_strides) {
909               CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
910             }
911             code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
912                  << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
913             code << tab << "ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", strides" << var_suffix << "_0, strides"
914                  << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n";
915           } else {
916             CeedSize                 l_size = 0;
917             CeedInt                  comp_stride;
918             CeedElemRestriction_Hip *rstr_data;
919 
920             CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
921             code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
922             CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
923             code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
924             CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
925             data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
926             code << tab << "ReadEVecSliceStandard3d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", " << Q_name << ">(data, l_size"
927                  << var_suffix << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
928           }
929           CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
930           break;
931         case CEED_EVAL_INTERP:
932           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
933           code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n";
934           tab.push();
935           code << tab << "r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n";
936           tab.pop();
937           code << tab << "}\n";
938           break;
939         case CEED_EVAL_GRAD:
940           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
941           code << tab << "GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_q" << var_suffix << ", s_G"
942                << var_suffix << ", r_s" << var_suffix << ");\n";
943           break;
944         case CEED_EVAL_WEIGHT:
945           code << tab << "CeedScalar r_s" << var_suffix << "[1];\n";
946           code << tab << "r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n";
947           break;
948           // LCOV_EXCL_START
949         case CEED_EVAL_DIV:
950         case CEED_EVAL_CURL:
951           break;  // TODO: Not implemented
952                   // LCOV_EXCL_STOP
953       }
954     }
955     code << "\n";
956     code << tab << "// -- Output fields\n";
957     for (CeedInt i = 0; i < num_output_fields; i++) {
958       const char *field_name;
959       std::string var_suffix = "_out_" + std::to_string(i);
960 
961       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
962       code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
963       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
964       // Basis action
965       switch (eval_mode) {
966         case CEED_EVAL_NONE:
967           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
968           break;
969         case CEED_EVAL_INTERP:
970           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
971           break;
972         case CEED_EVAL_GRAD:
973           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
974           break;
975           // LCOV_EXCL_START
976         case CEED_EVAL_WEIGHT:
977           break;  // Should not occur
978         case CEED_EVAL_DIV:
979         case CEED_EVAL_CURL:
980           break;  // TODO: Not implemented
981                   // LCOV_EXCL_STOP
982       }
983     }
984   } else {
985     code << "\n";
986     code << tab << "// Note: Using full elements\n";
987     code << tab << "{\n";
988     tab.push();
989     code << tab << "// -- Input fields\n";
990     for (CeedInt i = 0; i < num_input_fields; i++) {
991       const char *field_name;
992 
993       CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
994       code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
995       code << tab << "CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n";
996     }
997     code << tab << "// -- Output fields\n";
998     for (CeedInt i = 0; i < num_output_fields; i++) {
999       const char *field_name;
1000 
1001       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
1002       code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
1003       code << tab << "CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n";
1004     }
1005   }
1006 
1007   // Input and output buffers
1008   code << "\n";
1009   code << tab << "// -- QFunction inputs and outputs\n";
1010   code << tab << "// ---- Inputs\n";
1011   code << tab << "CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
1012   for (CeedInt i = 0; i < num_input_fields; i++) {
1013     const char *field_name;
1014 
1015     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
1016     code << tab << "// ------ Input field " << i << ": " << field_name << "\n";
1017     code << tab << "inputs[" << i << "] = r_s_in_" << i << ";\n";
1018   }
1019   code << tab << "// ---- Outputs\n";
1020   code << tab << "CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
1021   for (CeedInt i = 0; i < num_output_fields; i++) {
1022     const char *field_name;
1023 
1024     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
1025     code << tab << "// ------ Output field " << i << ": " << field_name << "\n";
1026     code << tab << "outputs[" << i << "] = r_s_out_" << i << ";\n";
1027   }
1028 
1029   // Apply QFunction
1030   code << "\n";
1031   code << tab << "// -- Apply QFunction\n";
1032   code << tab << "" << qfunction_name << "(ctx, ";
1033   if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
1034     code << "1";
1035   } else {
1036     code << Q_name;
1037   }
1038   code << ", inputs, outputs);\n";
1039 
1040   if (is_at_points) {
1041     // Map back to coefficients
1042     code << "\n";
1043     code << tab << "// -- Output fields\n";
1044     for (CeedInt i = 0; i < num_output_fields; i++) {
1045       const char *field_name;
1046       std::string var_suffix = "_out_" + std::to_string(i);
1047       std::string P_name     = "P_1d" + var_suffix;
1048 
1049       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
1050       code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
1051       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
1052       // Basis action
1053       code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
1054       switch (eval_mode) {
1055         case CEED_EVAL_NONE: {
1056           CeedInt             comp_stride;
1057           CeedElemRestriction elem_rstr;
1058 
1059           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
1060           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
1061           CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1062           code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
1063           code << tab << "WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
1064                << ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]"
1065                << ", r_s" << var_suffix << ", d" << var_suffix << ");\n";
1066           break;
1067         }
1068         case CEED_EVAL_INTERP:
1069           code << tab << "if (i >= points.num_per_elem[elem]) {\n";
1070           tab.push();
1071           code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
1072           tab.pop();
1073           code << tab << "}\n";
1074           code << tab << "InterpTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
1075                << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
1076           break;
1077         case CEED_EVAL_GRAD:
1078           code << tab << "if (i >= points.num_per_elem[elem]) {\n";
1079           tab.push();
1080           code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
1081           tab.pop();
1082           code << tab << "}\n";
1083           code << tab << "GradTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
1084                << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
1085           break;
1086           // LCOV_EXCL_START
1087         case CEED_EVAL_WEIGHT:
1088           break;  // Should not occur
1089         case CEED_EVAL_DIV:
1090         case CEED_EVAL_CURL:
1091           break;  // TODO: Not implemented
1092                   // LCOV_EXCL_STOP
1093       }
1094     }
1095   } else if (use_3d_slices) {
1096     // Copy or apply transpose grad, if needed
1097     code << "\n";
1098     code << tab << "// -- Output fields\n";
1099     for (CeedInt i = 0; i < num_output_fields; i++) {
1100       const char *field_name;
1101       std::string var_suffix = "_out_" + std::to_string(i);
1102       std::string P_name     = "P_1d" + var_suffix;
1103 
1104       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
1105       code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
1106       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
1107       // Basis action
1108       code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
1109       switch (eval_mode) {
1110         case CEED_EVAL_NONE:
1111           code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
1112           tab.push();
1113           code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
1114           tab.pop();
1115           code << tab << "}\n";
1116           break;
1117         case CEED_EVAL_INTERP:
1118           code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
1119           tab.push();
1120           code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
1121           tab.pop();
1122           code << tab << "}\n";
1123           break;
1124         case CEED_EVAL_GRAD:
1125           code << tab << "GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_s" << var_suffix << ", s_G"
1126                << var_suffix << ", r_q" << var_suffix << ");\n";
1127           break;
1128           // LCOV_EXCL_START
1129         case CEED_EVAL_WEIGHT:
1130           break;  // Should not occur
1131         case CEED_EVAL_DIV:
1132         case CEED_EVAL_CURL:
1133           break;  // TODO: Not implemented
1134                   // LCOV_EXCL_STOP
1135       }
1136     }
1137   }
1138   tab.pop();
1139   code << tab << "}\n";
1140   return CEED_ERROR_SUCCESS;
1141 }
1142 
1143 //------------------------------------------------------------------------------
1144 // Build single operator kernel
1145 //------------------------------------------------------------------------------
1146 extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_build) {
1147   bool                   is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
1148   Ceed                   ceed;
1149   CeedInt                Q = 0, Q_1d = 0, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
1150   CeedQFunctionField    *qf_input_fields, *qf_output_fields;
1151   CeedQFunction_Hip_gen *qf_data;
1152   CeedQFunction          qf;
1153   CeedOperatorField     *op_input_fields, *op_output_fields;
1154   CeedOperator_Hip_gen  *data;
1155   std::ostringstream     code;
1156   Tab                    tab;
1157 
1158   CeedCallBackend(CeedOperatorGetData(op, &data));
1159   {
1160     bool is_setup_done;
1161 
1162     CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
1163     if (is_setup_done) {
1164       *is_good_build = !data->use_fallback;
1165       return CEED_ERROR_SUCCESS;
1166     }
1167   }
1168 
1169   // Check field compatibility
1170   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
1171   {
1172     bool has_shared_bases = true;
1173 
1174     for (CeedInt i = 0; i < num_input_fields; i++) {
1175       CeedBasis basis;
1176 
1177       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
1178       if (basis != CEED_BASIS_NONE) {
1179         bool        is_tensor = true;
1180         const char *resource;
1181         char       *resource_root;
1182         Ceed        basis_ceed;
1183 
1184         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
1185         is_all_tensor    = is_all_tensor && is_tensor;
1186         is_all_nontensor = is_all_nontensor && !is_tensor;
1187         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
1188         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
1189         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
1190         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
1191         CeedCallBackend(CeedFree(&resource_root));
1192         CeedCallBackend(CeedDestroy(&basis_ceed));
1193       }
1194       CeedCallBackend(CeedBasisDestroy(&basis));
1195     }
1196 
1197     for (CeedInt i = 0; i < num_output_fields; i++) {
1198       CeedBasis basis;
1199 
1200       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
1201       if (basis != CEED_BASIS_NONE) {
1202         bool        is_tensor = true;
1203         const char *resource;
1204         char       *resource_root;
1205         Ceed        basis_ceed;
1206 
1207         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
1208         is_all_tensor    = is_all_tensor && is_tensor;
1209         is_all_nontensor = is_all_nontensor && !is_tensor;
1210 
1211         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
1212         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
1213         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
1214         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
1215         CeedCallBackend(CeedFree(&resource_root));
1216         CeedCallBackend(CeedDestroy(&basis_ceed));
1217       }
1218       CeedCallBackend(CeedBasisDestroy(&basis));
1219     }
1220     // -- Fallback to ref if not all bases are shared
1221     if (!has_shared_bases) {
1222       *is_good_build = false;
1223       return CEED_ERROR_SUCCESS;
1224     }
1225   }
1226   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1227   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
1228   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
1229   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
1230 
1231   // Get operator data
1232   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
1233   {
1234     CeedInt max_P = 0, max_P_1d = 0;
1235 
1236     CeedCallBackend(CeedOperatorBuildKernelData_Hip_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields,
1237                                                         qf_output_fields, &max_P, &max_P_1d, &Q, &Q_1d, &max_dim, &is_all_tensor, &use_3d_slices));
1238     data->max_P_1d = is_all_tensor ? max_P_1d : max_P;
1239   }
1240   if (max_dim == 0) max_dim = 1;
1241   data->dim = max_dim;
1242   if (is_at_points) {
1243     CeedElemRestriction_Hip *rstr_data;
1244     CeedElemRestriction      rstr_points = NULL;
1245 
1246     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
1247     CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
1248     CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
1249     CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data));
1250     data->points.indices = (CeedInt *)rstr_data->d_offsets;
1251     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
1252   }
1253   if (is_at_points) use_3d_slices = false;
1254   if (Q_1d == 0) {
1255     if (is_at_points) Q_1d = max_num_points;
1256     else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d));
1257   }
1258   if (Q == 0) Q = Q_1d;
1259   data->Q    = Q;
1260   data->Q_1d = Q_1d;
1261 
1262   // Check for restriction only identity operator
1263   {
1264     bool is_identity_qf;
1265 
1266     CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf));
1267     if (is_identity_qf) {
1268       CeedEvalMode eval_mode_in, eval_mode_out;
1269 
1270       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in));
1271       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out));
1272       CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND,
1273                 "Backend does not implement restriction only identity operators");
1274     }
1275   }
1276 
1277   // Load basis source files
1278   if (!is_all_nontensor) {
1279     code << tab << "// Tensor basis source\n";
1280     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
1281   }
1282   if (!is_all_tensor) {
1283     code << tab << "// Non-tensor basis source\n";
1284     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n";
1285   }
1286   if (is_at_points) {
1287     code << tab << "// AtPoints basis source\n";
1288     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n";
1289   }
1290   if (!is_all_tensor && !is_all_nontensor) {
1291     code << tab << "// Tensor basis source\n";
1292     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h>\n\n";
1293   }
1294   code << tab << "// CodeGen operator source\n";
1295   code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
1296 
1297   // Get QFunction name
1298   std::string qfunction_name(qf_data->qfunction_name);
1299   std::string operator_name;
1300 
1301   operator_name = "CeedKernelHipGenOperator_" + qfunction_name;
1302 
1303   // Define CEED_Q_VLA
1304   code << "\n" << tab << "#undef CEED_Q_VLA\n";
1305   if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
1306     code << tab << "#define CEED_Q_VLA 1\n\n";
1307   } else {
1308     code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n";
1309   }
1310 
1311   // Add user QFunction source
1312   {
1313     const char *source_path;
1314 
1315     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
1316     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
1317 
1318     code << tab << "// User QFunction source\n";
1319     code << tab << "#include \"" << source_path << "\"\n\n";
1320   }
1321 
1322   // Setup
1323   code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
1324   code << tab << "// Operator Kernel\n";
1325   code << tab << "// \n";
1326   code << tab << "// d_[in,out]_i:   CeedVector device array\n";
1327   code << tab << "// r_[in,out]_e_i: Element vector register\n";
1328   code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
1329   code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
1330   code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
1331   code << tab << "// \n";
1332   code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
1333   code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
1334   code << tab << "// -----------------------------------------------------------------------------\n";
1335   code << tab << "extern \"C\" __launch_bounds__(BLOCK_SIZE)\n";
1336   code << "__global__ void " << operator_name
1337        << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar* W, Points_Hip points) {\n";
1338   tab.push();
1339 
1340   // Scratch buffers
1341   for (CeedInt i = 0; i < num_input_fields; i++) {
1342     CeedEvalMode eval_mode;
1343 
1344     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1345     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
1346       code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
1347     }
1348   }
1349   for (CeedInt i = 0; i < num_output_fields; i++) {
1350     code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
1351   }
1352 
1353   code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
1354   if (!is_all_tensor) {
1355     code << tab << "const CeedInt Q = " << Q << ";\n";
1356   }
1357   if (!is_all_nontensor) {
1358     code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
1359   }
1360   if (is_at_points) {
1361     code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n";
1362     code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
1363   }
1364 
1365   // Shared data
1366   code << tab << "extern __shared__ CeedScalar slice[];\n";
1367   code << tab << "SharedData_Hip data;\n";
1368   code << tab << "data.t_id_x = threadIdx.x;\n";
1369   code << tab << "data.t_id_y = threadIdx.y;\n";
1370   code << tab << "data.t_id_z = threadIdx.z;\n";
1371   code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
1372   code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
1373 
1374   // -- Determine input mat reuse
1375   FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
1376 
1377   for (CeedInt i = 0; i < num_input_fields; i++) {
1378     input_matrix_reuse[i].index = -1;
1379   }
1380   for (CeedInt i = 0; i < num_input_fields; i++) {
1381     bool         is_tensor = true;
1382     CeedEvalMode eval_mode_i;
1383     CeedBasis    basis_i;
1384 
1385     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
1386     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
1387     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
1388     CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
1389     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
1390       CeedEvalMode eval_mode_j;
1391       CeedBasis    basis_j;
1392 
1393       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1394       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1395       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1396       if (basis_i == basis_j) {
1397         if (is_tensor) {
1398           input_matrix_reuse[i].index     = j;
1399           input_matrix_reuse[i].is_input  = true;
1400           input_matrix_reuse[i].eval_mode = eval_mode_j;
1401         } else {
1402           // For non-tensor can only re-use with the same eval mode
1403           if (eval_mode_i == eval_mode_j) {
1404             input_matrix_reuse[i].index     = j;
1405             input_matrix_reuse[i].is_input  = true;
1406             input_matrix_reuse[i].eval_mode = eval_mode_j;
1407           }
1408         }
1409       }
1410       CeedCallBackend(CeedBasisDestroy(&basis_j));
1411     }
1412     CeedCallBackend(CeedBasisDestroy(&basis_i));
1413   }
1414 
1415   // -- Determine output mat reuse
1416   FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
1417 
1418   for (CeedInt i = 0; i < num_output_fields; i++) {
1419     output_matrix_reuse[i].index = -1;
1420   }
1421   for (CeedInt i = 0; i < num_output_fields; i++) {
1422     bool         is_tensor = true;
1423     CeedEvalMode eval_mode_i;
1424     CeedBasis    basis_i;
1425 
1426     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
1427     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
1428     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
1429       CeedEvalMode eval_mode_j;
1430       CeedBasis    basis_j;
1431 
1432       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1433       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1434       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1435       if (basis_i == basis_j) {
1436         if (is_tensor) {
1437           output_matrix_reuse[i].index     = j;
1438           output_matrix_reuse[i].is_input  = true;
1439           output_matrix_reuse[i].eval_mode = eval_mode_j;
1440         } else {
1441           // For non-tensor can only re-use with the same eval mode
1442           if (eval_mode_i == eval_mode_j) {
1443             output_matrix_reuse[i].index     = j;
1444             output_matrix_reuse[i].is_input  = true;
1445             output_matrix_reuse[i].eval_mode = eval_mode_j;
1446           }
1447         }
1448       }
1449       CeedCallBackend(CeedBasisDestroy(&basis_j));
1450     }
1451     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
1452       CeedEvalMode eval_mode_j;
1453       CeedBasis    basis_j;
1454 
1455       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
1456       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1457       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
1458       CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
1459       if (basis_i == basis_j) {
1460         if (is_tensor) {
1461           output_matrix_reuse[i].index     = j;
1462           output_matrix_reuse[i].is_input  = false;
1463           output_matrix_reuse[i].eval_mode = eval_mode_j;
1464         } else {
1465           // For non-tensor can only re-use with the same eval mode
1466           if (eval_mode_i == eval_mode_j) {
1467             output_matrix_reuse[i].index     = j;
1468             output_matrix_reuse[i].is_input  = false;
1469             output_matrix_reuse[i].eval_mode = eval_mode_j;
1470           }
1471         }
1472       }
1473       CeedCallBackend(CeedBasisDestroy(&basis_j));
1474     }
1475     CeedCallBackend(CeedBasisDestroy(&basis_i));
1476   }
1477 
1478   // Initialize constants, and matrices B and G
1479   code << "\n" << tab << "// Input field constants and basis data\n";
1480   for (CeedInt i = 0; i < num_input_fields; i++) {
1481     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
1482                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false));
1483   }
1484   code << "\n" << tab << "// Output field constants and basis data\n";
1485   for (CeedInt i = 0; i < num_output_fields; i++) {
1486     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
1487                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false));
1488   }
1489 
1490   // Loop over all elements
1491   code << "\n" << tab << "// Element loop\n";
1492   code << tab << "__syncthreads();\n";
1493   code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
1494   tab.push();
1495 
1496   // -- Compute minimum buffer space needed
1497   CeedInt max_rstr_buffer_size = 1;
1498 
1499   for (CeedInt i = 0; i < num_input_fields; i++) {
1500     CeedEvalMode eval_mode;
1501 
1502     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1503     if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
1504       CeedInt             num_comp;
1505       CeedElemRestriction elem_rstr;
1506 
1507       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
1508       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1509       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1510       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1511     }
1512   }
1513   for (CeedInt i = 0; i < num_output_fields; i++) {
1514     CeedEvalMode eval_mode;
1515 
1516     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
1517     if (eval_mode != CEED_EVAL_NONE) {
1518       CeedInt             num_comp;
1519       CeedElemRestriction elem_rstr;
1520 
1521       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
1522       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1523       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1524       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1525     }
1526   }
1527   code << tab << "// Scratch restriction buffer space\n";
1528   code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
1529 
1530   // -- Determine best input field processing order
1531   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
1532 
1533   for (CeedInt i = 0; i < num_input_fields; i++) {
1534     field_rstr_in_buffer[i] = -1;
1535     input_field_order[i]    = -1;
1536   }
1537   {
1538     bool    is_ordered[CEED_FIELD_MAX];
1539     CeedInt curr_index = 0;
1540 
1541     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
1542     for (CeedInt i = 0; i < num_input_fields; i++) {
1543       CeedVector          vec_i;
1544       CeedElemRestriction rstr_i;
1545 
1546       if (is_ordered[i]) continue;
1547       field_rstr_in_buffer[i]       = i;
1548       is_ordered[i]                 = true;
1549       input_field_order[curr_index] = i;
1550       curr_index++;
1551       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
1552       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
1553       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
1554       for (CeedInt j = i + 1; j < num_input_fields; j++) {
1555         CeedVector          vec_j;
1556         CeedElemRestriction rstr_j;
1557 
1558         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
1559         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
1560         if (rstr_i == rstr_j && vec_i == vec_j) {
1561           field_rstr_in_buffer[j]       = i;
1562           is_ordered[j]                 = true;
1563           input_field_order[curr_index] = j;
1564           curr_index++;
1565         }
1566         CeedCallBackend(CeedVectorDestroy(&vec_j));
1567         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
1568       }
1569       CeedCallBackend(CeedVectorDestroy(&vec_i));
1570       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
1571     }
1572   }
1573 
1574   // -- Input restriction and basis
1575   code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
1576   for (CeedInt i = 0; i < num_input_fields; i++) {
1577     const char   *field_name;
1578     const CeedInt f = input_field_order[i];
1579 
1580     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
1581     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
1582 
1583     // ---- Restriction
1584     CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
1585                                                                max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1586 
1587     // ---- Basis action
1588     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
1589                                                          is_all_tensor, is_at_points, use_3d_slices));
1590   }
1591 
1592   // -- Q function
1593   CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
1594                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
1595                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices));
1596 
1597   // -- Output basis and restriction
1598   code << "\n" << tab << "// -- Output field basis action and restrictions\n";
1599   for (CeedInt i = 0; i < num_output_fields; i++) {
1600     const char *field_name;
1601 
1602     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
1603     code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
1604 
1605     // ---- Basis action
1606     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
1607                                                          is_all_tensor, is_at_points, use_3d_slices));
1608 
1609     // ---- Restriction
1610     CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, i, NULL, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d,
1611                                                                false, is_all_tensor, is_at_points, use_3d_slices));
1612   }
1613 
1614   // Close loop and function
1615   tab.pop();
1616   code << tab << "}\n";
1617   tab.pop();
1618   code << tab << "}\n";
1619   code << tab << "// -----------------------------------------------------------------------------\n\n";
1620 
1621   CeedInt block_sizes[3] = {0, 0, 0};
1622   CeedInt num_elem;
1623 
1624   // Compile
1625   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
1626   CeedCallBackend(BlockGridCalculate_Hip_gen(is_all_tensor ? max_dim : 1, num_elem, data->max_P_1d, is_all_tensor ? Q_1d : Q, block_sizes));
1627   {
1628     bool is_compile_good = false;
1629 
1630     data->thread_1d = block_sizes[0];
1631     CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "OP_T_1D", block_sizes[0], "BLOCK_SIZE",
1632                                        block_sizes[0] * block_sizes[1] * block_sizes[2]));
1633     if (is_compile_good) {
1634       *is_good_build = true;
1635       CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, operator_name.c_str(), &data->op));
1636     } else {
1637       *is_good_build     = false;
1638       data->use_fallback = true;
1639     }
1640   }
1641   CeedCallBackend(CeedOperatorSetSetupDone(op));
1642   CeedCallBackend(CeedDestroy(&ceed));
1643   CeedCallBackend(CeedQFunctionDestroy(&qf));
1644   return CEED_ERROR_SUCCESS;
1645 }
1646 
1647 //------------------------------------------------------------------------------
1648 // Build AtPoints assembly operator kernel
1649 //------------------------------------------------------------------------------
1650 static int CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op, bool is_full, bool *is_good_build) {
1651   bool                   is_all_tensor = true, is_at_points = false, use_3d_slices = false;
1652   Ceed                   ceed;
1653   CeedInt                Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
1654   CeedQFunctionField    *qf_input_fields, *qf_output_fields;
1655   CeedQFunction_Hip_gen *qf_data;
1656   CeedQFunction          qf;
1657   CeedOperatorField     *op_input_fields, *op_output_fields;
1658   CeedOperator_Hip_gen  *data;
1659   std::ostringstream     code;
1660   Tab                    tab;
1661 
1662   // Check compatibility
1663   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1664   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
1665   CeedCheck(is_at_points, ceed, CEED_ERROR_BACKEND, "Only AtPoints operator assembly supported");
1666 
1667   // Retrieve operator data
1668   CeedCallBackend(CeedOperatorGetData(op, &data));
1669   Q       = data->Q;
1670   Q_1d    = data->Q_1d;
1671   max_dim = data->dim;
1672   {
1673     CeedElemRestriction rstr_points = NULL;
1674 
1675     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
1676     CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
1677     CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
1678     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
1679   }
1680   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
1681   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
1682   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
1683   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
1684 
1685   // Load basis source files
1686   code << tab << "// Tensor basis source\n";
1687   code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
1688   code << tab << "// AtPoints basis source\n";
1689   code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n";
1690   code << tab << "// CodeGen operator source\n";
1691   code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
1692 
1693   // Get QFunction name
1694   std::string qfunction_name(qf_data->qfunction_name);
1695   std::string operator_name;
1696 
1697   if (is_full) {
1698     operator_name = "CeedKernelHipGenOperatorFullAssembly_" + qfunction_name;
1699   } else {
1700     operator_name = "CeedKernelHipGenOperatorDiagonalAssembly_" + qfunction_name;
1701   }
1702 
1703   // Define CEED_Q_VLA
1704   code << "\n" << tab << "#undef CEED_Q_VLA\n";
1705   code << tab << "#define CEED_Q_VLA 1\n\n";
1706 
1707   // Add user QFunction source
1708   {
1709     const char *source_path;
1710 
1711     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
1712     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
1713 
1714     code << tab << "// User QFunction source\n";
1715     code << tab << "#include \"" << source_path << "\"\n\n";
1716   }
1717 
1718   // Setup
1719   code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
1720   code << tab << "// Operator Assembly Kernel\n";
1721   code << tab << "// \n";
1722   code << tab << "// d_[in,out]_i:   CeedVector device array\n";
1723   code << tab << "// r_[in,out]_e_i: Element vector register\n";
1724   code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
1725   code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
1726   code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
1727   code << tab << "// \n";
1728   code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
1729   code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
1730   code << tab << "// -----------------------------------------------------------------------------\n";
1731   code << tab << "extern \"C\" __global__ void " << operator_name
1732        << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar *W, Points_Hip "
1733           "points, CeedScalar *__restrict__ values_array) {\n";
1734   tab.push();
1735 
1736   // Scratch buffers
1737   for (CeedInt i = 0; i < num_input_fields; i++) {
1738     CeedEvalMode eval_mode;
1739 
1740     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1741     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
1742       code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
1743     }
1744   }
1745   for (CeedInt i = 0; i < num_output_fields; i++) {
1746     code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
1747   }
1748 
1749   code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
1750   code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
1751   code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n";
1752   code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
1753 
1754   // Shared data
1755   code << tab << "extern __shared__ CeedScalar slice[];\n";
1756   code << tab << "SharedData_Hip data;\n";
1757   code << tab << "data.t_id_x = threadIdx.x;\n";
1758   code << tab << "data.t_id_y = threadIdx.y;\n";
1759   code << tab << "data.t_id_z = threadIdx.z;\n";
1760   code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
1761   code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
1762 
1763   // -- Determine input mat reuse
1764   FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
1765 
1766   for (CeedInt i = 0; i < num_input_fields; i++) {
1767     input_matrix_reuse[i].index = -1;
1768   }
1769   for (CeedInt i = 0; i < num_input_fields; i++) {
1770     CeedEvalMode eval_mode_i;
1771     CeedBasis    basis_i;
1772 
1773     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
1774     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
1775     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
1776     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
1777       CeedEvalMode eval_mode_j;
1778       CeedBasis    basis_j;
1779 
1780       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1781       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1782       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1783       if (basis_i == basis_j) {
1784         input_matrix_reuse[i].index     = j;
1785         input_matrix_reuse[i].is_input  = true;
1786         input_matrix_reuse[i].eval_mode = eval_mode_j;
1787       }
1788       CeedCallBackend(CeedBasisDestroy(&basis_j));
1789     }
1790     CeedCallBackend(CeedBasisDestroy(&basis_i));
1791   }
1792 
1793   // -- Determine output mat reuse
1794   FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
1795 
1796   for (CeedInt i = 0; i < num_output_fields; i++) {
1797     output_matrix_reuse[i].index = -1;
1798   }
1799   for (CeedInt i = 0; i < num_output_fields; i++) {
1800     CeedEvalMode eval_mode_i;
1801     CeedBasis    basis_i;
1802 
1803     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
1804     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
1805     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
1806       CeedEvalMode eval_mode_j;
1807       CeedBasis    basis_j;
1808 
1809       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1810       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1811       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1812       if (basis_i == basis_j) {
1813         output_matrix_reuse[i].index     = j;
1814         output_matrix_reuse[i].is_input  = true;
1815         output_matrix_reuse[i].eval_mode = eval_mode_j;
1816       }
1817       CeedCallBackend(CeedBasisDestroy(&basis_j));
1818     }
1819     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
1820       CeedEvalMode eval_mode_j;
1821       CeedBasis    basis_j;
1822 
1823       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
1824       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1825       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
1826       if (basis_i == basis_j) {
1827         output_matrix_reuse[i].index     = j;
1828         output_matrix_reuse[i].is_input  = false;
1829         output_matrix_reuse[i].eval_mode = eval_mode_j;
1830       }
1831       CeedCallBackend(CeedBasisDestroy(&basis_j));
1832     }
1833     CeedCallBackend(CeedBasisDestroy(&basis_i));
1834   }
1835 
1836   // Initialize constants, and matrices B and G
1837   code << "\n" << tab << "// Input field constants and basis data\n";
1838   for (CeedInt i = 0; i < num_input_fields; i++) {
1839     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
1840                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false));
1841   }
1842   code << "\n" << tab << "// Output field constants and basis data\n";
1843   for (CeedInt i = 0; i < num_output_fields; i++) {
1844     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
1845                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false));
1846   }
1847 
1848   // Loop over all elements
1849   code << "\n" << tab << "// Element loop\n";
1850   code << tab << "__syncthreads();\n";
1851   code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
1852   tab.push();
1853 
1854   // -- Compute minimum buffer space needed
1855   CeedInt max_rstr_buffer_size = 1;
1856 
1857   for (CeedInt i = 0; i < num_input_fields; i++) {
1858     CeedEvalMode eval_mode;
1859 
1860     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1861     if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
1862       CeedInt             num_comp;
1863       CeedElemRestriction elem_rstr;
1864 
1865       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
1866       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1867       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1868       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1869     }
1870   }
1871   for (CeedInt i = 0; i < num_output_fields; i++) {
1872     CeedEvalMode eval_mode;
1873 
1874     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
1875     if (eval_mode != CEED_EVAL_NONE) {
1876       CeedInt             num_comp;
1877       CeedElemRestriction elem_rstr;
1878 
1879       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
1880       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1881       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1882       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1883     }
1884   }
1885   code << tab << "// Scratch restriction buffer space\n";
1886   code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
1887 
1888   // -- Determine best input field processing order
1889   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
1890 
1891   for (CeedInt i = 0; i < num_input_fields; i++) {
1892     field_rstr_in_buffer[i] = -1;
1893     input_field_order[i]    = -1;
1894   }
1895   {
1896     bool    is_ordered[CEED_FIELD_MAX];
1897     CeedInt curr_index = 0;
1898 
1899     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
1900     for (CeedInt i = 0; i < num_input_fields; i++) {
1901       CeedVector          vec_i;
1902       CeedElemRestriction rstr_i;
1903 
1904       if (is_ordered[i]) continue;
1905       field_rstr_in_buffer[i]       = i;
1906       is_ordered[i]                 = true;
1907       input_field_order[curr_index] = i;
1908       curr_index++;
1909       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
1910       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
1911       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
1912       for (CeedInt j = i + 1; j < num_input_fields; j++) {
1913         CeedVector          vec_j;
1914         CeedElemRestriction rstr_j;
1915 
1916         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
1917         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
1918         if (rstr_i == rstr_j && vec_i == vec_j) {
1919           field_rstr_in_buffer[j]       = i;
1920           is_ordered[j]                 = true;
1921           input_field_order[curr_index] = j;
1922           curr_index++;
1923         }
1924         CeedCallBackend(CeedVectorDestroy(&vec_j));
1925         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
1926       }
1927       CeedCallBackend(CeedVectorDestroy(&vec_i));
1928       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
1929     }
1930   }
1931 
1932   // -- Input restriction and basis
1933   code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
1934   CeedInt active_field_index = -1;
1935 
1936   for (CeedInt i = 0; i < num_input_fields; i++) {
1937     bool          is_active = false;
1938     const char   *field_name;
1939     const CeedInt f = input_field_order[i];
1940 
1941     {
1942       CeedVector vec;
1943 
1944       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
1945       is_active = vec == CEED_VECTOR_ACTIVE;
1946       CeedCallBackend(CeedVectorDestroy(&vec));
1947     }
1948 
1949     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
1950     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
1951 
1952     if (is_active) {
1953       std::string var_suffix = "_in_" + std::to_string(f);
1954 
1955       code << tab << "// Active field - no restriction or basis action here\n";
1956       if (active_field_index == -1) {
1957         active_field_index = f;
1958         code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? "P_1d" + var_suffix : "1")
1959              << "] = {0.0};\n";
1960       } else {
1961         code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_in_" << active_field_index << ";\n";
1962       }
1963     } else {
1964       // ---- Restriction
1965       CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
1966                                                                  max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1967 
1968       // ---- Basis action
1969       CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
1970                                                            is_all_tensor, is_at_points, use_3d_slices));
1971     }
1972   }
1973 
1974   // -- Loop over active field
1975   std::string active_var_suffix = "_in_" + std::to_string(active_field_index);
1976 
1977   code << "\n" << tab << "// Loop over nodes in active field\n";
1978   code << tab << "for (CeedInt n = 0; n < num_comp" << active_var_suffix << "*P_1d" << active_var_suffix
1979        << (max_dim > 1 ? "*P_1d" + active_var_suffix : "") << (max_dim > 2 ? "*P_1d" + active_var_suffix : "") << "; n++) {\n";
1980   tab.push();
1981 
1982   // -- Set current active node and component to 1
1983   code << tab << "// Set current active node and component to 1.0\n";
1984   code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 1.0, r_e"
1985        << active_var_suffix << ");\n\n";
1986 
1987   for (CeedInt i = 0; i < num_input_fields; i++) {
1988     bool          is_active = false;
1989     const char   *field_name;
1990     const CeedInt f = input_field_order[i];
1991 
1992     {
1993       CeedVector vec;
1994 
1995       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
1996       is_active = vec == CEED_VECTOR_ACTIVE;
1997       CeedCallBackend(CeedVectorDestroy(&vec));
1998     }
1999     if (!is_active) continue;
2000 
2001     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
2002     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
2003 
2004     // ---- Basis action
2005     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
2006                                                          is_all_tensor, is_at_points, use_3d_slices));
2007   }
2008 
2009   // -- Q function
2010   CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
2011                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
2012                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices));
2013 
2014   // -- Output basis and restriction
2015   code << "\n" << tab << "// -- Output field basis action and restrictions\n";
2016   for (CeedInt i = 0; i < num_output_fields; i++) {
2017     bool        is_active = false;
2018     const char *field_name;
2019 
2020     {
2021       CeedVector vec;
2022 
2023       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
2024       is_active = vec == CEED_VECTOR_ACTIVE;
2025       CeedCallBackend(CeedVectorDestroy(&vec));
2026     }
2027     if (!is_active) continue;
2028 
2029     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
2030     code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
2031 
2032     // ---- Basis action
2033     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
2034                                                          is_all_tensor, is_at_points, use_3d_slices));
2035 
2036     // ---- Restriction
2037     if (is_full) {
2038       std::string         var_suffix = "_out_" + std::to_string(i);
2039       CeedInt             comp_stride;
2040       CeedSize            l_size;
2041       CeedElemRestriction elem_rstr;
2042 
2043       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
2044       CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
2045       code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
2046       CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
2047       code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
2048       code << tab << "WriteLVecStandard" << max_dim << "d_Assembly<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
2049            << ">(data, l_size" << var_suffix << ", elem, n, r_e" << var_suffix << ", values_array);\n";
2050       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
2051     } else {
2052       std::string         var_suffix = "_out_" + std::to_string(i);
2053       CeedInt             comp_stride;
2054       CeedSize            l_size;
2055       CeedElemRestriction elem_rstr;
2056 
2057       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
2058       CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
2059       code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
2060       CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
2061       code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
2062       code << tab << "WriteLVecStandard" << max_dim << "d_Single<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
2063            << ">(data, l_size" << var_suffix << ", elem, n, indices.outputs[" << i << "], r_e" << var_suffix << ", values_array);\n";
2064       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
2065     }
2066   }
2067 
2068   // -- Reset current active node and component
2069   code << "\n" << tab << "// Reset current active node and component to 0.0\n";
2070   code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 0.0, r_e"
2071        << active_var_suffix << ");\n";
2072 
2073   // -- End of loop over active field
2074   tab.pop();
2075   code << tab << "}\n";
2076 
2077   // Close loop and function
2078   tab.pop();
2079   code << tab << "}\n";
2080   tab.pop();
2081   code << tab << "}\n";
2082   code << tab << "// -----------------------------------------------------------------------------\n\n";
2083 
2084   CeedInt block_sizes[3] = {0, 0, 0};
2085   CeedInt num_elem;
2086 
2087   // Compile
2088   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
2089   CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes));
2090   {
2091     bool is_compile_good = false;
2092 
2093     data->thread_1d = block_sizes[0];
2094     CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good,
2095                                        is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 2, "OP_T_1D", block_sizes[0],
2096                                        "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]));
2097     if (is_compile_good) {
2098       *is_good_build = true;
2099       CeedCallBackend(CeedGetKernel_Hip(ceed, is_full ? data->module_assemble_full : data->module_assemble_diagonal, operator_name.c_str(),
2100                                         is_full ? &data->assemble_full : &data->assemble_diagonal));
2101     } else {
2102       *is_good_build              = false;
2103       data->use_assembly_fallback = true;
2104     }
2105   }
2106   CeedCallBackend(CeedDestroy(&ceed));
2107   CeedCallBackend(CeedQFunctionDestroy(&qf));
2108   return CEED_ERROR_SUCCESS;
2109 }
2110 
2111 extern "C" int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) {
2112   return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, false, is_good_build);
2113 }
2114 
2115 extern "C" int CeedOperatorBuildKernelFullAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) {
2116   return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, true, is_good_build);
2117 }
2118 //------------------------------------------------------------------------------
2119 // Build QFunction assembly operator kernel
2120 //------------------------------------------------------------------------------
2121 extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperator op, bool *is_good_build) {
2122   bool                   is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
2123   Ceed                   ceed;
2124   CeedInt                Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0;
2125   CeedQFunctionField    *qf_input_fields, *qf_output_fields;
2126   CeedQFunction_Hip_gen *qf_data;
2127   CeedQFunction          qf;
2128   CeedOperatorField     *op_input_fields, *op_output_fields;
2129   CeedOperator_Hip_gen  *data;
2130   std::ostringstream     code;
2131   Tab                    tab;
2132 
2133   // Check compatibility
2134   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
2135   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
2136   CeedCheck(!is_at_points, ceed, CEED_ERROR_BACKEND, "AtPoints QFunction assembly is not supported");
2137 
2138   // Check field compatibility
2139   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
2140   {
2141     bool has_shared_bases = true;
2142 
2143     for (CeedInt i = 0; i < num_input_fields; i++) {
2144       CeedBasis basis;
2145 
2146       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
2147       if (basis != CEED_BASIS_NONE) {
2148         bool        is_tensor = true;
2149         const char *resource;
2150         char       *resource_root;
2151         Ceed        basis_ceed;
2152 
2153         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
2154         is_all_tensor    = is_all_tensor && is_tensor;
2155         is_all_nontensor = is_all_nontensor && !is_tensor;
2156         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
2157         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
2158         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
2159         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
2160         CeedCallBackend(CeedFree(&resource_root));
2161         CeedCallBackend(CeedDestroy(&basis_ceed));
2162       }
2163       CeedCallBackend(CeedBasisDestroy(&basis));
2164     }
2165 
2166     for (CeedInt i = 0; i < num_output_fields; i++) {
2167       CeedBasis basis;
2168 
2169       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
2170       if (basis != CEED_BASIS_NONE) {
2171         bool        is_tensor = true;
2172         const char *resource;
2173         char       *resource_root;
2174         Ceed        basis_ceed;
2175 
2176         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
2177         is_all_tensor    = is_all_tensor && is_tensor;
2178         is_all_nontensor = is_all_nontensor && !is_tensor;
2179 
2180         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
2181         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
2182         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
2183         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
2184         CeedCallBackend(CeedFree(&resource_root));
2185         CeedCallBackend(CeedDestroy(&basis_ceed));
2186       }
2187       CeedCallBackend(CeedBasisDestroy(&basis));
2188     }
2189   }
2190 
2191   // Retrieve operator data
2192   CeedCallBackend(CeedOperatorGetData(op, &data));
2193   Q       = data->Q;
2194   Q_1d    = data->Q_1d;
2195   max_dim = data->dim;
2196   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
2197   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
2198   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
2199 
2200   // Load basis source files
2201   if (!is_all_nontensor) {
2202     code << tab << "// Tensor basis source\n";
2203     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
2204   }
2205   if (!is_all_tensor) {
2206     code << tab << "// Non-tensor basis source\n";
2207     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n";
2208   }
2209   if (!is_all_tensor && !is_all_nontensor) {
2210     code << "// Tensor basis source\n";
2211     code << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h>\n\n";
2212   }
2213   code << "// CodeGen operator source\n";
2214   code << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
2215 
2216   // Get QFunction name
2217   std::string qfunction_name(qf_data->qfunction_name);
2218   std::string operator_name;
2219 
2220   operator_name = "CeedKernelHipGenQFunctionAssembly_" + qfunction_name;
2221 
2222   // Define CEED_Q_VLA
2223   code << "\n" << tab << "#undef CEED_Q_VLA\n";
2224   if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
2225     code << tab << "#define CEED_Q_VLA 1\n\n";
2226   } else {
2227     code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n";
2228   }
2229 
2230   // Add user QFunction source
2231   {
2232     const char *source_path;
2233 
2234     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
2235     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
2236 
2237     code << tab << "// User QFunction source\n";
2238     code << tab << "#include \"" << source_path << "\"\n\n";
2239   }
2240 
2241   // Setup
2242   code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
2243   code << tab << "// Operator Assembly Kernel\n";
2244   code << tab << "// \n";
2245   code << tab << "// d_[in,out]_i:   CeedVector device array\n";
2246   code << tab << "// r_[in,out]_e_i: Element vector register\n";
2247   code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
2248   code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
2249   code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
2250   code << tab << "// \n";
2251   code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
2252   code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
2253   code << tab << "// -----------------------------------------------------------------------------\n";
2254   code << tab << "extern \"C\" __global__ void " << operator_name
2255        << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar *W, Points_Hip "
2256           "points, CeedScalar *__restrict__ values_array) {\n";
2257   tab.push();
2258 
2259   // Scratch buffers
2260   for (CeedInt i = 0; i < num_input_fields; i++) {
2261     CeedEvalMode eval_mode;
2262 
2263     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
2264     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
2265       code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
2266     }
2267   }
2268   for (CeedInt i = 0; i < num_output_fields; i++) {
2269     code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
2270   }
2271 
2272   code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
2273   if (!is_all_tensor) {
2274     code << tab << "const CeedInt Q = " << Q << ";\n";
2275   }
2276   if (!is_all_nontensor) {
2277     code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
2278   }
2279 
2280   // Shared data
2281   code << tab << "extern __shared__ CeedScalar slice[];\n";
2282   code << tab << "SharedData_Hip data;\n";
2283   code << tab << "data.t_id_x = threadIdx.x;\n";
2284   code << tab << "data.t_id_y = threadIdx.y;\n";
2285   code << tab << "data.t_id_z = threadIdx.z;\n";
2286   code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
2287   code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
2288 
2289   // -- Determine input mat reuse
2290   FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
2291 
2292   for (CeedInt i = 0; i < num_input_fields; i++) {
2293     input_matrix_reuse[i].index = -1;
2294   }
2295   for (CeedInt i = 0; i < num_input_fields; i++) {
2296     bool         is_tensor = true;
2297     CeedEvalMode eval_mode_i;
2298     CeedBasis    basis_i;
2299 
2300     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
2301     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
2302     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
2303     CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
2304     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
2305       CeedEvalMode eval_mode_j;
2306       CeedBasis    basis_j;
2307 
2308       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
2309       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
2310       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
2311       if (basis_i == basis_j) {
2312         if (is_tensor) {
2313           input_matrix_reuse[i].index     = j;
2314           input_matrix_reuse[i].is_input  = true;
2315           input_matrix_reuse[i].eval_mode = eval_mode_j;
2316         } else {
2317           // For non-tensor can only re-use with the same eval mode
2318           if (eval_mode_i == eval_mode_j) {
2319             input_matrix_reuse[i].index     = j;
2320             input_matrix_reuse[i].is_input  = true;
2321             input_matrix_reuse[i].eval_mode = eval_mode_j;
2322           }
2323         }
2324       }
2325       CeedCallBackend(CeedBasisDestroy(&basis_j));
2326     }
2327     CeedCallBackend(CeedBasisDestroy(&basis_i));
2328   }
2329 
2330   // -- Determine output mat reuse
2331   FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
2332 
2333   for (CeedInt i = 0; i < num_output_fields; i++) {
2334     output_matrix_reuse[i].index = -1;
2335   }
2336   for (CeedInt i = 0; i < num_output_fields; i++) {
2337     bool         is_tensor = true;
2338     CeedEvalMode eval_mode_i;
2339     CeedBasis    basis_i;
2340 
2341     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
2342     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
2343     CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
2344     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
2345       CeedEvalMode eval_mode_j;
2346       CeedBasis    basis_j;
2347 
2348       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
2349       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
2350       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
2351       if (basis_i == basis_j) {
2352         if (is_tensor) {
2353           output_matrix_reuse[i].index     = j;
2354           output_matrix_reuse[i].is_input  = true;
2355           output_matrix_reuse[i].eval_mode = eval_mode_j;
2356         } else {
2357           // For non-tensor can only re-use with the same eval mode
2358           if (eval_mode_i == eval_mode_j) {
2359             output_matrix_reuse[i].index     = j;
2360             output_matrix_reuse[i].is_input  = true;
2361             output_matrix_reuse[i].eval_mode = eval_mode_j;
2362           }
2363         }
2364       }
2365       CeedCallBackend(CeedBasisDestroy(&basis_j));
2366     }
2367     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
2368       CeedEvalMode eval_mode_j;
2369       CeedBasis    basis_j;
2370 
2371       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
2372       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
2373       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
2374       if (basis_i == basis_j) {
2375         if (is_tensor) {
2376           output_matrix_reuse[i].index     = j;
2377           output_matrix_reuse[i].is_input  = false;
2378           output_matrix_reuse[i].eval_mode = eval_mode_j;
2379         } else {
2380           // For non-tensor can only re-use with the same eval mode
2381           if (eval_mode_i == eval_mode_j) {
2382             output_matrix_reuse[i].index     = j;
2383             output_matrix_reuse[i].is_input  = false;
2384             output_matrix_reuse[i].eval_mode = eval_mode_j;
2385           }
2386         }
2387       }
2388       CeedCallBackend(CeedBasisDestroy(&basis_j));
2389     }
2390     CeedCallBackend(CeedBasisDestroy(&basis_i));
2391   }
2392 
2393   // Initialize constants, and matrices B and G
2394   code << "\n" << tab << "// Input field constants and basis data\n";
2395   for (CeedInt i = 0; i < num_input_fields; i++) {
2396     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
2397                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, true));
2398   }
2399   code << "\n" << tab << "// Output field constants and basis data\n";
2400   for (CeedInt i = 0; i < num_output_fields; i++) {
2401     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
2402                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, true));
2403   }
2404 
2405   // Loop over all elements
2406   code << "\n" << tab << "// Element loop\n";
2407   code << tab << "__syncthreads();\n";
2408   code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
2409   tab.push();
2410 
2411   // -- Compute minimum buffer space needed
2412   CeedInt max_rstr_buffer_size = 1;
2413 
2414   for (CeedInt i = 0; i < num_input_fields; i++) {
2415     CeedEvalMode eval_mode;
2416 
2417     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
2418     if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
2419       CeedInt             num_comp;
2420       CeedElemRestriction elem_rstr;
2421 
2422       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
2423       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
2424       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
2425       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
2426     }
2427   }
2428   for (CeedInt i = 0; i < num_output_fields; i++) {
2429     CeedEvalMode eval_mode;
2430 
2431     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
2432     if (eval_mode != CEED_EVAL_NONE) {
2433       CeedInt             num_comp;
2434       CeedElemRestriction elem_rstr;
2435 
2436       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
2437       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
2438       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
2439       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
2440     }
2441   }
2442   code << tab << "// Scratch restriction buffer space\n";
2443   code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
2444 
2445   // -- Determine best input field processing order
2446   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
2447 
2448   for (CeedInt i = 0; i < num_input_fields; i++) {
2449     field_rstr_in_buffer[i] = -1;
2450     input_field_order[i]    = -1;
2451   }
2452   {
2453     bool    is_ordered[CEED_FIELD_MAX];
2454     CeedInt curr_index = 0;
2455 
2456     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
2457     for (CeedInt i = 0; i < num_input_fields; i++) {
2458       CeedVector          vec_i;
2459       CeedElemRestriction rstr_i;
2460 
2461       if (is_ordered[i]) continue;
2462       field_rstr_in_buffer[i]       = i;
2463       is_ordered[i]                 = true;
2464       input_field_order[curr_index] = i;
2465       curr_index++;
2466       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
2467       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
2468       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
2469       for (CeedInt j = i + 1; j < num_input_fields; j++) {
2470         CeedVector          vec_j;
2471         CeedElemRestriction rstr_j;
2472 
2473         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
2474         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
2475         if (rstr_i == rstr_j && vec_i == vec_j) {
2476           field_rstr_in_buffer[j]       = i;
2477           is_ordered[j]                 = true;
2478           input_field_order[curr_index] = j;
2479           curr_index++;
2480         }
2481         CeedCallBackend(CeedVectorDestroy(&vec_j));
2482         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
2483       }
2484       CeedCallBackend(CeedVectorDestroy(&vec_i));
2485       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
2486     }
2487   }
2488 
2489   // -- Input restriction and basis
2490   code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
2491   CeedInt num_active_in = 0, num_active_out = 0, qf_assembly_size_out = 0;
2492   CeedInt active_fields_in[CEED_FIELD_MAX], active_fields_out[CEED_FIELD_MAX];
2493 
2494   for (CeedInt i = 0; i < num_input_fields; i++) {
2495     bool          is_active = false;
2496     const char   *field_name;
2497     const CeedInt f = input_field_order[i];
2498 
2499     {
2500       CeedVector vec;
2501 
2502       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
2503       is_active = vec == CEED_VECTOR_ACTIVE;
2504       CeedCallBackend(CeedVectorDestroy(&vec));
2505     }
2506 
2507     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
2508     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
2509 
2510     if (is_active) {
2511       CeedEvalMode eval_mode;
2512       CeedInt      field_size;
2513 
2514       active_fields_in[num_active_in] = f;
2515       num_active_in++;
2516       CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[f], &field_size));
2517       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[f], &eval_mode));
2518       if (eval_mode == CEED_EVAL_GRAD) {
2519         code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << "dim_in_" << f << "*"
2520              << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
2521       } else {
2522         code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
2523       }
2524       code << tab << "const CeedInt field_size_in_" << f << " = " << field_size << ";\n";
2525     } else {
2526       // ---- Restriction
2527       CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
2528                                                                  max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
2529 
2530       // ---- Basis action
2531       CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
2532                                                            is_all_tensor, is_at_points, use_3d_slices));
2533     }
2534   }
2535   code << tab << "const CeedInt field_sizes_in[" << num_active_in << "] = {";
2536   for (CeedInt i = 0; i < num_active_in; i++) {
2537     code << "field_size_in_" << active_fields_in[i] << (i < num_active_in - 1 ? ", " : "");
2538   }
2539   code << "};\n";
2540   code << tab << "CeedScalar * r_q_in[" << num_active_in << "] = {";
2541   for (CeedInt i = 0; i < num_active_in; i++) {
2542     code << "r_q_in_" << active_fields_in[i] << (i < num_active_in - 1 ? ", " : "");
2543   }
2544   code << "};\n";
2545 
2546   for (CeedInt i = 0; i < num_output_fields; i++) {
2547     bool is_active = false;
2548 
2549     {
2550       CeedVector vec;
2551 
2552       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
2553       is_active = vec == CEED_VECTOR_ACTIVE;
2554       CeedCallBackend(CeedVectorDestroy(&vec));
2555     }
2556     if (is_active) {
2557       const char *field_name;
2558       CeedInt     field_size;
2559 
2560       active_fields_out[num_active_out] = i;
2561       num_active_out++;
2562       CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
2563       qf_assembly_size_out += field_size;
2564       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
2565       code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
2566       code << tab << "const CeedInt field_size_out_" << i << " = " << field_size << ";\n";
2567     }
2568   }
2569   code << tab << "const CeedInt field_sizes_out[" << num_active_out << "] = {";
2570   for (CeedInt i = 0; i < num_active_out; i++) {
2571     code << "field_size_out_" << active_fields_out[i] << (i < num_active_out - 1 ? ", " : "");
2572   }
2573   code << "};\n";
2574   code << tab << "const CeedInt total_size_out = " << qf_assembly_size_out << ";\n";
2575 
2576   // -- Loop over active field
2577   code << "\n" << tab << "CeedInt input_offset = 0;\n";
2578   code << tab << "// Loop over active QFunction input fields\n";
2579   code << tab << "const CeedInt num_active_in = " << num_active_in << ";\n";
2580   code << tab << "for (CeedInt a = 0; a < num_active_in; a++) {\n";
2581   tab.push();
2582 
2583   // -- Loop over size of active field
2584   code << "\n" << tab << "// Loop over current active input field size\n";
2585   code << tab << "const CeedInt field_size_in = field_sizes_in[a];\n";
2586   code << tab << "for (CeedInt s = 0; s < field_size_in; s++) {\n";
2587   tab.push();
2588 
2589   // -- Set current active point and component to 1
2590   code << tab << "// Set current active point and component to 1.0\n";
2591   if (is_all_tensor && (max_dim >= 3)) {
2592     code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 1.0;\n";
2593   } else {
2594     code << tab << "r_q_in[a][s] = 1.0;\n";
2595   }
2596 
2597   // -- Q function
2598   CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
2599                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
2600                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices));
2601 
2602   // -- Output basis and restriction
2603   code << "\n" << tab << "// -- Output field basis action and restrictions\n";
2604   CeedScalar offset = 0;
2605 
2606   for (CeedInt i = 0; i < num_output_fields; i++) {
2607     bool        is_active = false;
2608     const char *field_name;
2609 
2610     {
2611       CeedVector vec;
2612 
2613       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
2614       is_active = vec == CEED_VECTOR_ACTIVE;
2615       CeedCallBackend(CeedVectorDestroy(&vec));
2616     }
2617     if (!is_active) continue;
2618 
2619     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
2620     code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
2621 
2622     // ---- Restriction
2623     CeedInt field_size;
2624 
2625     code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d_QFAssembly<total_size_out, field_size_out_" << i << ", "
2626          << (is_all_tensor ? "Q_1d" : "Q") << ">(data, num_elem, elem, input_offset + s, " << offset << ", r_q_out_" << i << ", values_array);\n";
2627     CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
2628     offset += field_size;
2629   }
2630 
2631   // -- Reset current active node and component
2632   code << "\n" << tab << "// Reset current active node and component to 0.0\n";
2633   if (is_all_tensor && (max_dim >= 3)) {
2634     code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 0.0;\n";
2635   } else {
2636     code << tab << "r_q_in[a][s] = 0.0;\n";
2637   }
2638 
2639   // -- End of loop over size of active field
2640   tab.pop();
2641   code << tab << "}\n";
2642   code << tab << "input_offset += field_size_in;\n";
2643 
2644   // -- End of loop over active field
2645   tab.pop();
2646   code << tab << "}\n";
2647 
2648   // Close loop and function
2649   tab.pop();
2650   code << tab << "}\n";
2651   tab.pop();
2652   code << tab << "}\n";
2653   code << tab << "// -----------------------------------------------------------------------------\n\n";
2654 
2655   CeedInt block_sizes[3] = {0, 0, 0};
2656   CeedInt num_elem;
2657 
2658   // Compile
2659   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
2660   CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes));
2661   {
2662     bool is_compile_good = false;
2663 
2664     data->thread_1d = block_sizes[0];
2665     CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 2, "OP_T_1D", block_sizes[0],
2666                                        "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]));
2667     if (is_compile_good) {
2668       *is_good_build = true;
2669       CeedCallBackend(CeedGetKernel_Hip(ceed, data->module_assemble_qfunction, operator_name.c_str(), &data->assemble_qfunction));
2670     } else {
2671       *is_good_build              = false;
2672       data->use_assembly_fallback = true;
2673     }
2674   }
2675   CeedCallBackend(CeedDestroy(&ceed));
2676   CeedCallBackend(CeedQFunctionDestroy(&qf));
2677   return CEED_ERROR_SUCCESS;
2678 }
2679 
2680 //------------------------------------------------------------------------------
2681