xref: /libCEED/rust/libceed-sys/c-src/backends/hip-gen/ceed-hip-gen-operator-build.cpp (revision e31b7a9f017acbb99fc9680ebe1cb8561925e8f7)
1 // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #define CEED_DEBUG_COLOR 12
9 
10 #include <ceed.h>
11 #include <ceed/backend.h>
12 #include <ceed/gen-tools.h>
13 #include <ceed/jit-tools.h>
14 
15 #include <iostream>
16 #include <sstream>
17 #include <string>
18 
19 #include "../hip-ref/ceed-hip-ref.h"
20 #include "../hip-shared/ceed-hip-shared.h"
21 #include "../hip/ceed-hip-common.h"
22 #include "../hip/ceed-hip-compile.h"
23 #include "ceed-hip-gen.h"
24 
// Describes whether a field may reuse the shared-memory basis matrices
// (s_B/s_G) already loaded for a previously processed field, and which field
// that is.
struct FieldReuse_Hip {
  CeedInt      index;      // Index of the field to reuse, or -1 for no reuse
  bool         is_input;   // True if the reused field is an input field
  CeedEvalMode eval_mode;  // Eval mode of the reused field (s_G reuse requires CEED_EVAL_GRAD)
};
30 
31 //------------------------------------------------------------------------------
32 // Calculate the block size used for launching the operator kernel
33 //------------------------------------------------------------------------------
34 extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt num_elem, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes) {
35   const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
36   if (dim == 1) {
37     CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
38 
39     elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
40     block_sizes[0]  = thread_1d;
41     block_sizes[1]  = 1;
42     block_sizes[2]  = elems_per_block;
43   } else if (dim == 2) {
44     const CeedInt elems_per_block = thread_1d < 4 ? 16 : 2;
45 
46     block_sizes[0] = thread_1d;
47     block_sizes[1] = thread_1d;
48     block_sizes[2] = elems_per_block;
49   } else if (dim == 3) {
50     const CeedInt elems_per_block = thread_1d < 6 ? 4 : (thread_1d < 8 ? 2 : 1);
51 
52     block_sizes[0] = thread_1d;
53     block_sizes[1] = thread_1d;
54     block_sizes[2] = elems_per_block;
55   }
56   return CEED_ERROR_SUCCESS;
57 }
58 
59 //------------------------------------------------------------------------------
60 // Determine type of operator
61 //------------------------------------------------------------------------------
62 static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
63                                                CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields,
64                                                CeedQFunctionField *qf_output_fields, CeedInt *max_P, CeedInt *max_P_1d, CeedInt *Q, CeedInt *Q_1d,
65                                                CeedInt *max_dim, bool *is_all_tensor, bool *use_3d_slices) {
66   // Check if all are tensor
67   *is_all_tensor = true;
68   for (CeedInt i = 0; i < num_input_fields; i++) {
69     CeedBasis basis;
70 
71     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
72     if (basis != CEED_BASIS_NONE) {
73       bool is_field_tensor;
74 
75       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
76       *is_all_tensor = *is_all_tensor && is_field_tensor;
77     }
78     CeedCallBackend(CeedBasisDestroy(&basis));
79   }
80   for (CeedInt i = 0; i < num_output_fields; i++) {
81     CeedBasis basis;
82 
83     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
84     if (basis != CEED_BASIS_NONE) {
85       bool is_field_tensor;
86 
87       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
88       *is_all_tensor = *is_all_tensor && is_field_tensor;
89     }
90     CeedCallBackend(CeedBasisDestroy(&basis));
91   }
92 
93   // Find max_P, max_P_1d, Q, and Q_1d
94   bool is_all_3d = true;
95 
96   *max_P    = 0;
97   *max_P_1d = 0;
98   *Q        = 0;
99   *Q_1d     = 0;
100   for (CeedInt i = 0; i < num_input_fields; i++) {
101     CeedBasis basis;
102 
103     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
104     if (basis != CEED_BASIS_NONE) {
105       bool    is_field_tensor;
106       CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
107 
108       // Check if 3D
109       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
110       is_all_3d = is_all_3d && (field_dim == 3);
111       *max_dim  = CeedIntMax(*max_dim, field_dim);
112 
113       // Collect P, P_1d, Q, and Q_1d
114       CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
115       *max_P = CeedIntMax(*max_P, field_P);
116       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
117       if (is_field_tensor) {
118         CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
119         *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
120       }
121       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
122       CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
123       *Q = field_Q;
124       if (is_field_tensor) {
125         CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
126         CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
127         *Q_1d = field_Q_1d;
128       }
129     }
130     CeedCallBackend(CeedBasisDestroy(&basis));
131   }
132   for (CeedInt i = 0; i < num_output_fields; i++) {
133     CeedBasis basis;
134 
135     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
136     if (basis != CEED_BASIS_NONE) {
137       bool    is_field_tensor;
138       CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
139 
140       // Check if 3D
141       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
142       is_all_3d = is_all_3d && (field_dim == 3);
143       *max_dim  = CeedIntMax(*max_dim, field_dim);
144 
145       // Collect P, P_1d, Q, and Q_1d
146       CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
147       *max_P = CeedIntMax(*max_P, field_P);
148       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
149       if (is_field_tensor) {
150         CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
151         *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
152       }
153       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
154       CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
155       *Q = field_Q;
156       if (is_field_tensor) {
157         CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
158         CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
159         *Q_1d = field_Q_1d;
160       }
161     }
162     CeedCallBackend(CeedBasisDestroy(&basis));
163   }
164 
165   // Only use 3D collocated gradient parallelization strategy when gradient is computed
166   *use_3d_slices = false;
167   if (is_all_3d && *is_all_tensor) {
168     bool was_grad_found = false;
169 
170     for (CeedInt i = 0; i < num_input_fields; i++) {
171       CeedEvalMode eval_mode;
172 
173       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
174       if (eval_mode == CEED_EVAL_GRAD) {
175         CeedBasis_Hip_shared *basis_data;
176         CeedBasis             basis;
177 
178         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
179         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
180         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
181         was_grad_found = true;
182         CeedCallBackend(CeedBasisDestroy(&basis));
183       }
184     }
185     for (CeedInt i = 0; i < num_output_fields; i++) {
186       CeedEvalMode eval_mode;
187 
188       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
189       if (eval_mode == CEED_EVAL_GRAD) {
190         CeedBasis_Hip_shared *basis_data;
191         CeedBasis             basis;
192 
193         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
194         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
195         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
196         was_grad_found = true;
197         CeedCallBackend(CeedBasisDestroy(&basis));
198       }
199     }
200   }
201   return CEED_ERROR_SUCCESS;
202 }
203 
//------------------------------------------------------------------------------
// Setup fields
//------------------------------------------------------------------------------
// Emit per-field constants (dim, P, Q, num_comp) into the generated kernel
// source and stage basis matrices: device pointers are recorded in data->B/G
// and shared-memory loads (s_B/s_G) are emitted into the kernel. field_reuse
// lets this field alias a previously loaded field's shared-memory matrices;
// skip_active_load suppresses loads for active fields (s_B/s_G set to NULL).
static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i,
                                                    CeedOperatorField op_field, CeedQFunctionField qf_field, FieldReuse_Hip field_reuse,
                                                    CeedInt max_dim, CeedInt Q, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
                                                    bool use_3d_slices, bool skip_active_load) {
  bool      is_tensor = true, is_active = true;
  CeedBasis basis;

  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
  if (basis != CEED_BASIS_NONE) CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
  {
    // Active fields are those bound to CEED_VECTOR_ACTIVE
    CeedVector vec;

    CeedCallBackend(CeedOperatorFieldGetVector(op_field, &vec));
    is_active = vec == CEED_VECTOR_ACTIVE;
    CeedCallBackend(CeedVectorDestroy(&vec));
  }

  const char           *field_name;
  std::string           var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
  std::string           P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
  std::string           option_name = (is_input ? "inputs" : "outputs");
  CeedEvalMode          eval_mode   = CEED_EVAL_NONE;
  CeedInt               elem_size = 0, num_comp = 0, dim = max_dim, P_1d = 0;
  CeedElemRestriction   elem_rstr;
  CeedBasis_Hip_shared *basis_data;

  // Field reuse info; index == -1 means no previous field to reuse
  bool use_previous_field = field_reuse.index != -1;

  CeedCallBackend(CeedOperatorFieldGetName(op_field, &field_name));
  code << tab << "// -- " << (is_input ? "Input" : "Output") << " field " << i << ": " << field_name << "\n";

  // Get field data
  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
  }
  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
  // NOTE(review): basis_data is only set when basis != CEED_BASIS_NONE; the
  // INTERP/GRAD cases below dereference it, so those eval modes presumably
  // never occur with CEED_BASIS_NONE — confirm against callers
  if (basis != CEED_BASIS_NONE) {
    CeedCallBackend(CeedBasisGetData(basis, &basis_data));
    CeedCallBackend(CeedBasisGetDimension(basis, &dim));
    if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
    else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
  }
  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));

  // Set field constants in the generated source
  code << tab << "const CeedInt dim" << var_suffix << " = " << dim << ";\n";
  if (is_tensor && !is_all_tensor) {
    // Mixed tensor/non-tensor operator: also emit the total node count P
    CeedInt P = 0;

    CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
    code << tab << "const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q : P) << ";\n";
  }
  // Fields without a basis use the quadrature size as their node count
  code << tab << "const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
  if (eval_mode != CEED_EVAL_WEIGHT) {
    code << tab << "const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
  }

  // Load basis data
  code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
  switch (eval_mode) {
    case CEED_EVAL_NONE:
      break;  // No basis matrices needed
    case CEED_EVAL_INTERP:
      if (is_at_points) {
        // AtPoints: lazily build and upload the Chebyshev interpolation matrix
        if (!basis_data->d_chebyshev_interp_1d) {
          CeedSize    interp_bytes;
          CeedScalar *chebyshev_interp_1d;

          interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
          CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
          CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
          CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
          CeedCallHip(CeedBasisReturnCeed(basis),
                      hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
          CeedCallBackend(CeedFree(&chebyshev_interp_1d));
        }
        if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
        else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
      } else {
        // Standard quadrature
        if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
        else data->B.outputs[i] = basis_data->d_interp_1d;
      }
      if (use_previous_field && !skip_active_load) {
        // Alias the shared-memory interp matrix already loaded for the reused field
        std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

        code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
      } else {
        bool is_collocated = false;

        CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
        if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) {
          // No matrix needed: active load skipped, or collocated basis (identity interp)
          code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
        } else {
          code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
          code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
        }
      }
      break;
    case CEED_EVAL_GRAD:
      if (is_at_points) {
        // AtPoints: lazily build and upload the Chebyshev interpolation matrix
        // (same staging as the INTERP case above)
        if (!basis_data->d_chebyshev_interp_1d) {
          CeedSize    interp_bytes;
          CeedScalar *chebyshev_interp_1d;

          interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
          CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
          CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
          CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
          CeedCallHip(CeedBasisReturnCeed(basis),
                      hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
          CeedCallBackend(CeedFree(&chebyshev_interp_1d));
        }
        if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
        else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
      } else {
        // Standard quadrature
        if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
        else data->B.outputs[i] = basis_data->d_interp_1d;
      }
      if (is_tensor) {
        // Tensor gradient also needs the interp matrix s_B
        if (use_previous_field && !skip_active_load) {
          std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

          code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
        } else {
          bool is_collocated = false;

          CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
          if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) {
            code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
          } else {
            code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
            code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
          }
        }
      }
      if (is_at_points) break;  // No G mat for AtPoints
      if (use_3d_slices) {
        // 3D slice strategy: gradient is applied at quadrature points, Q x Q
        if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
        else data->G.outputs[i] = basis_data->d_collo_grad_1d;
        if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
          // s_G reuse only valid if the reused field was also a gradient
          std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

          code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
        } else if (is_active && skip_active_load) {
          code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
        } else {
          code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
          code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
        }
      } else {
        // Prefer the collocated gradient matrix when the basis provides one
        bool has_collo_grad = basis_data->d_collo_grad_1d;

        if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
        else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
        if (has_collo_grad) {
          // Collocated gradient: Q x Q matrix
          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
            std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

            code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
          } else if (is_active && skip_active_load) {
            code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
          } else {
            code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
            code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
          }
        } else {
          // Full gradient: P x Q (x dim for non-tensor bases)
          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
            std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

            code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
          } else if (is_active && skip_active_load) {
            code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
          } else {
            code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim")
                 << (is_tensor ? "" : var_suffix) << "];\n";
            code << tab << "LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << (is_tensor ? "" : var_suffix) << ">(data, G."
                 << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
          }
        }
      }
      break;
    case CEED_EVAL_WEIGHT:
      break;  // No action
      // LCOV_EXCL_START
    case CEED_EVAL_DIV:
    case CEED_EVAL_CURL:
      break;  // TODO: Not implemented
              // LCOV_EXCL_STOP
  }
  CeedCallBackend(CeedBasisDestroy(&basis));
  return CEED_ERROR_SUCCESS;
}
406 
//------------------------------------------------------------------------------
// Restriction
//------------------------------------------------------------------------------
// Emit the element-restriction read (inputs) or write (outputs) for field i
// into the generated kernel source, and record device offset arrays in
// data->indices. field_input_buffer allows an input to alias the element
// buffer (r_e_in_*) of an earlier identical input restriction.
static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i,
                                                      CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
                                                      CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
                                                      bool use_3d_slices) {
  std::string              var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
  std::string              P_name     = (is_all_tensor ? "P_1d" : "P") + var_suffix;
  CeedEvalMode             eval_mode  = CEED_EVAL_NONE;
  CeedInt                  elem_size = 0, num_comp = 0;
  CeedSize                 l_size;
  CeedRestrictionType      rstr_type = CEED_RESTRICTION_STANDARD;
  CeedElemRestriction_Hip *rstr_data;
  CeedElemRestriction      elem_rstr;

  // Get field data
  // NOTE(review): rstr_data is only set when elem_rstr != CEED_ELEMRESTRICTION_NONE,
  // but the STANDARD/POINTS cases below dereference it — presumably a real
  // restriction is always present here; confirm against callers
  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
    CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
    CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
  }
  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));

  // Restriction
  if (is_input) {
    // Input
    if (field_input_buffer[i] != i) {
      std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);

      // Restriction was already done for previous input
      code << tab << "CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
    } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) {
      if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) {
        // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated
        code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
      } else if (rstr_type != CEED_RESTRICTION_POINTS) {
        // Otherwise we're using the scratch space
        code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
      }
      switch (rstr_type) {
        case CEED_RESTRICTION_STANDARD: {
          // Indexed gather from the L-vector via device offsets
          CeedInt comp_stride;

          CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
          code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
          data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
          code << tab << "ReadLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
               << P_name << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix
               << ");\n";
          break;
        }
        case CEED_RESTRICTION_STRIDED: {
          // Strided access; default backend strides are {1, elem_size*num_elem, elem_size}
          bool    has_backend_strides;
          CeedInt num_elem;

          CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
          CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
          CeedInt strides[3] = {1, elem_size * num_elem, elem_size};

          if (!has_backend_strides) {
            CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
          }
          code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
               << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
          code << tab << "ReadLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
               << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, d" << var_suffix << ", r_e"
               << var_suffix << ");\n";
          break;
        }
        case CEED_RESTRICTION_POINTS: {
          // AtPoints restriction: only record offsets; no element buffer read here
          CeedInt comp_stride;

          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
          data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
          break;
        }
        // LCOV_EXCL_START
        case CEED_RESTRICTION_ORIENTED:
        case CEED_RESTRICTION_CURL_ORIENTED:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
  } else {
    // Output: scatter the element buffer back to the L-vector
    switch (rstr_type) {
      case CEED_RESTRICTION_STANDARD: {
        CeedInt comp_stride;

        CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
        code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
        CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
        code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
        data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
        code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
             << P_name << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix
             << ");\n";
        break;
      }
      case CEED_RESTRICTION_STRIDED: {
        bool    has_backend_strides;
        CeedInt num_elem;

        CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
        CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
        CeedInt strides[3] = {1, elem_size * num_elem, elem_size};

        if (!has_backend_strides) {
          CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
        }
        code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
             << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
        code << tab << "WriteLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
             << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, r_e" << var_suffix << ", d" << var_suffix
             << ");\n";
        break;
      }
      case CEED_RESTRICTION_POINTS:
        // AtPoints output: only record offsets
        data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
        break;
      // LCOV_EXCL_START
      case CEED_RESTRICTION_ORIENTED:
      case CEED_RESTRICTION_CURL_ORIENTED:
        break;  // TODO: Not implemented
                // LCOV_EXCL_STOP
    }
  }
  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
  return CEED_ERROR_SUCCESS;
}
543 
544 //------------------------------------------------------------------------------
545 // Basis
546 //------------------------------------------------------------------------------
547 static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i, CeedOperatorField op_field,
548                                                 CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor,
549                                                 bool is_at_points, bool use_3d_slices) {
550   bool      is_tensor = true, is_collocated = true;
551   CeedBasis basis;
552   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
553   CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
554   CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
555 
556   std::string         var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
557   std::string         P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
558   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
559   CeedInt             dim = max_dim, elem_size = 0, num_comp = 0, P_1d = 0;
560   CeedElemRestriction elem_rstr;
561 
562   // Get field data
563   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
564   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
565     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
566     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
567   }
568   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
569   if (basis != CEED_BASIS_NONE) {
570     CeedCallBackend(CeedBasisGetDimension(basis, &dim));
571     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
572     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
573   }
574   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
575 
576   // Basis
577   code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
578   if (is_input) {
579     switch (eval_mode) {
580       case CEED_EVAL_NONE:
581         if (!use_3d_slices && !is_at_points) {
582           code << tab << "CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n";
583         }
584         break;
585       case CEED_EVAL_INTERP:
586         if (is_at_points) {
587           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
588 
589           code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
590           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
591                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
592         } else {
593           std::string function_name = is_tensor ? ((dim == 1 ? "Interp" : "InterpTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
594                                                    std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
595                                                 : "InterpNonTensor";
596           std::string op_t_1d_name  = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
597 
598           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
599           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
600                << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
601         }
602         break;
603       case CEED_EVAL_GRAD:
604         if (is_at_points) {
605           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
606 
607           code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
608           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
609                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
610         } else if (use_3d_slices) {
611           std::string function_name =
612               (dim > 1 ? "InterpTensor" : "Interp") + std::string(is_collocated ? "CollocatedNodes" : "") + std::to_string(dim) + "d";
613 
614           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
615           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
616                << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
617         } else if (is_tensor) {
618           bool        is_collocated_grad = dim == 3 && Q_1d >= P_1d;
619           std::string function_name =
620               (dim == 1 ? "Grad" : ("GradTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) +
621               std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
622           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
623 
624           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
625                << (is_all_tensor && dim >= 3 ? Q_name : "1") << "];\n";
626           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
627                << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
628         } else {
629           std::string function_name = "GradNonTensor";
630 
631           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
632           code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
633                << ", OP_T_1D>(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
634         }
635         break;
636       case CEED_EVAL_WEIGHT: {
637         if (is_at_points) {
638           code << tab << "// Nothing to do AtPoints\n";
639         } else {
640           CeedBasis_Hip_shared *basis_data;
641           std::string           function_name = is_tensor
642                                                     ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
643                                                     : "WeightNonTensor";
644 
645           code << tab << "CeedScalar r_q" << var_suffix << "[" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
646           CeedCallBackend(CeedBasisGetData(basis, &basis_data));
647           data->W = basis_data->d_q_weight_1d;
648           code << tab << function_name << "<" << P_name << ", " << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
649         }
650         break;
651       }
652       // LCOV_EXCL_START
653       case CEED_EVAL_DIV:
654       case CEED_EVAL_CURL:
655         break;  // TODO: Not implemented
656                 // LCOV_EXCL_STOP
657     }
658   } else {
659     switch (eval_mode) {
660       case CEED_EVAL_NONE:
661         code << tab << "CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
662         break;  // No action
663       case CEED_EVAL_INTERP:
664         code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
665         if (is_at_points) {
666           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
667 
668           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
669                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
670         } else {
671           std::string function_name =
672               is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
673                            std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
674                         : "InterpTransposeNonTensor";
675           std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
676 
677           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
678                << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
679         }
680         break;
681       case CEED_EVAL_GRAD:
682         code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
683         if (is_at_points) {
684           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
685 
686           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
687                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
688         } else if (use_3d_slices) {
689           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
690                                       std::to_string(dim) + "d";
691 
692           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
693                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
694         } else if (is_tensor) {
695           bool        is_collocated_grad = dim == 3 && Q_1d >= P_1d;
696           std::string function_name =
697               (dim == 1 ? "GradTranspose"
698                         : ("GradTransposeTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) +
699               std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
700           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
701 
702           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
703                << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
704         } else {
705           std::string function_name = "GradTransposeNonTensor";
706 
707           code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
708                << ", OP_T_1D>(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
709         }
710         break;
711       // LCOV_EXCL_START
712       case CEED_EVAL_WEIGHT:
713         break;  // Should not occur
714       case CEED_EVAL_DIV:
715       case CEED_EVAL_CURL:
716         break;  // TODO: Not implemented
717                 // LCOV_EXCL_STOP
718     }
719   }
720   CeedCallBackend(CeedBasisDestroy(&basis));
721   return CEED_ERROR_SUCCESS;
722 }
723 
724 //------------------------------------------------------------------------------
725 // QFunction
726 //------------------------------------------------------------------------------
// Emit the device source for the QFunction stage of the generated operator kernel:
// declare per-field quadrature registers, gather the QFunction inputs, invoke the
// user QFunction, and (where needed) scatter/accumulate the outputs.
//
// Three code-generation strategies are selected for the quadrature-point loop:
//   - is_at_points:   loop over batches of points per element (AtPoints operators)
//   - use_3d_slices:  loop over 1D slices of a 3D tensor element to save registers
//   - otherwise:      operate on full elements, no extra loop emitted here
// Whichever branch runs opens a scope (`{` plus tab.push()) that is closed by the
// matching tab.pop()/`}` at the bottom of this function.
//
// Returns CEED_ERROR_SUCCESS or propagates backend errors via CeedCallBackend.
static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt max_dim,
                                                    CeedInt max_num_points, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
                                                    CeedQFunctionField *qf_input_fields, CeedInt num_output_fields,
                                                    CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields,
                                                    std::string qfunction_name, CeedInt Q_1d, bool is_all_tensor, bool is_at_points,
                                                    bool use_3d_slices) {
  // All-tensor operators index quadrature data by the 1D size Q_1d; otherwise by total Q
  std::string         Q_name    = is_all_tensor ? "Q_1d" : "Q";
  CeedEvalMode        eval_mode = CEED_EVAL_NONE;
  CeedElemRestriction elem_rstr;

  // Setup output arrays
  code << "\n";
  code << tab << "// -- Output field setup\n";
  for (CeedInt i = 0; i < num_output_fields; i++) {
    const char *field_name;
    std::string var_suffix = "_out_" + std::to_string(i);

    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
    code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
    switch (eval_mode) {
      case CEED_EVAL_NONE:
        if (is_at_points) {
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n";
        } else {
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
               << "];\n";
        }
        break;
      case CEED_EVAL_INTERP:
        if (is_at_points) {
          // Accumulator for point data
          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
          code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
               << "[i] = 0.0;\n";
        } else {
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
               << "];\n";
        }
        break;
      case CEED_EVAL_GRAD:
        if (is_at_points) {
          // Accumulator for point data
          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
          code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
               << "[i] = 0.0;\n";
        } else if (use_3d_slices) {
          // Accumulator for gradient slices
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
          code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) r_q" << var_suffix << "[i] = 0.0;\n";
        } else {
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
               << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") << "];\n";
        }
        break;
      case CEED_EVAL_WEIGHT:
        break;
        // LCOV_EXCL_START
      case CEED_EVAL_DIV:
      case CEED_EVAL_CURL:
        break;  // TODO: Not implemented
                // LCOV_EXCL_STOP
    }
  }

  if (is_at_points) {
    // We need to handle batches of points
    code << "\n";
    code << tab << "// Note: Using batches of points\n";
    code << tab << "const CeedInt point_loop_bound = (blockDim.x*blockDim.y) * ceil((1.0*max_num_points) / (blockDim.x*blockDim.y));\n\n";
    code << tab << "#pragma unroll\n";
    code << tab << "for (CeedInt i = threadIdx.x + threadIdx.y*blockDim.x; i < point_loop_bound; i += blockDim.x*blockDim.y) {\n";
    tab.push();
    code << tab << "const CeedInt p = i % max_num_points;\n\n";

    code << tab << "// -- Coordinates\n";
    code << tab << "CeedScalar r_x[max_dim];\n";
    code << tab << "ReadPoint<max_dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n";

    code << tab << "// -- Input fields\n";
    for (CeedInt i = 0; i < num_input_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_in_" + std::to_string(i);
      std::string P_name     = "P_1d" + var_suffix;

      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
      code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
      // Basis action
      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          code << tab << "ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
               << ", max_num_points>(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
          break;
        case CEED_EVAL_INTERP:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          code << tab << "InterpAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
               << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
          break;
        case CEED_EVAL_GRAD:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          code << tab << "GradAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
               << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
          break;
        case CEED_EVAL_WEIGHT:
          // Quadrature weight at arbitrary points is identically 1
          code << tab << "CeedScalar r_s" << var_suffix << "[1];\n";
          code << tab << "r_s" << var_suffix << "[0] = 1.0;\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
    code << "\n";
    code << tab << "// -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_out_" + std::to_string(i);

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      // Basis action
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          break;
        case CEED_EVAL_INTERP:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          break;
        case CEED_EVAL_GRAD:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_WEIGHT:
          break;  // Should not occur
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }

  } else if (use_3d_slices) {
    // We treat quadrature points per slice in 3d to save registers
    code << "\n";
    code << tab << "// Note: Using planes of 3D elements\n";
    code << tab << "#pragma unroll\n";
    code << tab << "for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
    tab.push();
    code << tab << "// -- Input fields\n";
    for (CeedInt i = 0; i < num_input_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_in_" + std::to_string(i);

      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
      code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
      // Basis action
      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          // Unprocessed E-vector data is read slice-by-slice straight from global memory;
          // strided and standard (indexed) restrictions emit different reader templates
          bool is_strided;

          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";

          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
          CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
          if (is_strided) {
            bool    has_backend_strides;
            CeedInt num_elem, elem_size;

            CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
            CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
            CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
            // Default backend strides; overwritten below when user strides are supplied
            CeedInt strides[3] = {1, elem_size * num_elem, elem_size};

            if (!has_backend_strides) {
              CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
            }
            code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
                 << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
            code << tab << "ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", strides" << var_suffix << "_0, strides"
                 << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n";
          } else {
            CeedSize                 l_size = 0;
            CeedInt                  comp_stride;
            CeedElemRestriction_Hip *rstr_data;

            CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
            code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
            CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
            code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
            CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
            // Record the restriction's device offsets so the launch can supply
            // them as the kernel's indices.inputs[i] argument
            data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
            code << tab << "ReadEVecSliceStandard3d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", " << Q_name << ">(data, l_size"
                 << var_suffix << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
          }
          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
          break;
        case CEED_EVAL_INTERP:
          // Copy slice q of the interpolated quadrature data into the slice registers
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n";
          tab.push();
          code << tab << "r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n";
          tab.pop();
          code << tab << "}\n";
          break;
        case CEED_EVAL_GRAD:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          code << tab << "GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_q" << var_suffix << ", s_G"
               << var_suffix << ", r_s" << var_suffix << ");\n";
          break;
        case CEED_EVAL_WEIGHT:
          code << tab << "CeedScalar r_s" << var_suffix << "[1];\n";
          code << tab << "r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
    code << "\n";
    code << tab << "// -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_out_" + std::to_string(i);

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      // Basis action
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          break;
        case CEED_EVAL_INTERP:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          break;
        case CEED_EVAL_GRAD:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_WEIGHT:
          break;  // Should not occur
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
  } else {
    // Full elements: the QFunction registers alias the quadrature registers directly
    code << "\n";
    code << tab << "// Note: Using full elements\n";
    code << tab << "{\n";
    tab.push();
    code << tab << "// -- Input fields\n";
    for (CeedInt i = 0; i < num_input_fields; i++) {
      const char *field_name;

      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
      code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
      code << tab << "CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n";
    }
    code << tab << "// -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      code << tab << "CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n";
    }
  }

  // Input and output buffers
  code << "\n";
  code << tab << "// -- QFunction inputs and outputs\n";
  code << tab << "// ---- Inputs\n";
  code << tab << "CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
  for (CeedInt i = 0; i < num_input_fields; i++) {
    const char *field_name;

    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
    code << tab << "// ------ Input field " << i << ": " << field_name << "\n";
    code << tab << "inputs[" << i << "] = r_s_in_" << i << ";\n";
  }
  code << tab << "// ---- Outputs\n";
  code << tab << "CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
  for (CeedInt i = 0; i < num_output_fields; i++) {
    const char *field_name;

    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
    code << tab << "// ------ Output field " << i << ": " << field_name << "\n";
    code << tab << "outputs[" << i << "] = r_s_out_" << i << ";\n";
  }

  // Apply QFunction
  code << "\n";
  code << tab << "// -- Apply QFunction\n";
  code << tab << "" << qfunction_name << "(ctx, ";
  // Q argument: full 3D all-tensor elements hand the QFunction Q_1d points per
  // thread; every other configuration passes one point per invocation
  if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
    code << "1";
  } else {
    code << Q_name;
  }
  code << ", inputs, outputs);\n";

  if (is_at_points) {
    // Map back to coefficients
    code << "\n";
    code << tab << "// -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_out_" + std::to_string(i);
      std::string P_name     = "P_1d" + var_suffix;

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      // Basis action
      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
      switch (eval_mode) {
        case CEED_EVAL_NONE: {
          CeedInt             comp_stride;
          CeedElemRestriction elem_rstr;

          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
          code << tab << "WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
               << ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]"
               << ", r_s" << var_suffix << ", d" << var_suffix << ");\n";
          break;
        }
        case CEED_EVAL_INTERP:
          // Zero padding threads beyond the element's point count before accumulating
          code << tab << "if (i >= points.num_per_elem[elem]) {\n";
          tab.push();
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
          tab.pop();
          code << tab << "}\n";
          code << tab << "InterpTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
               << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
          break;
        case CEED_EVAL_GRAD:
          // Zero padding threads beyond the element's point count before accumulating
          code << tab << "if (i >= points.num_per_elem[elem]) {\n";
          tab.push();
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
          tab.pop();
          code << tab << "}\n";
          code << tab << "GradTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
               << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_WEIGHT:
          break;  // Should not occur
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
  } else if (use_3d_slices) {
    // Copy or apply transpose grad, if needed
    code << "\n";
    code << tab << "// -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_out_" + std::to_string(i);
      std::string P_name     = "P_1d" + var_suffix;

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      // Basis action
      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
          tab.push();
          code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
          tab.pop();
          code << tab << "}\n";
          break;
        case CEED_EVAL_INTERP:
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
          tab.push();
          code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
          tab.pop();
          code << tab << "}\n";
          break;
        case CEED_EVAL_GRAD:
          code << tab << "GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_s" << var_suffix << ", s_G"
               << var_suffix << ", r_q" << var_suffix << ");\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_WEIGHT:
          break;  // Should not occur
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
  }
  // Close the point-batch / slice / full-element scope opened above
  tab.pop();
  code << tab << "}\n";
  return CEED_ERROR_SUCCESS;
}
1142 
1143 //------------------------------------------------------------------------------
1144 // Build single operator kernel
1145 //------------------------------------------------------------------------------
1146 extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_build) {
1147   bool                   is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
1148   Ceed                   ceed;
1149   CeedInt                Q = 0, Q_1d = 0, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
1150   CeedQFunctionField    *qf_input_fields, *qf_output_fields;
1151   CeedQFunction_Hip_gen *qf_data;
1152   CeedQFunction          qf;
1153   CeedOperatorField     *op_input_fields, *op_output_fields;
1154   CeedOperator_Hip_gen  *data;
1155   std::ostringstream     code;
1156   Tab                    tab;
1157 
1158   CeedCallBackend(CeedOperatorGetData(op, &data));
1159   {
1160     bool is_setup_done;
1161 
1162     CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
1163     if (is_setup_done) {
1164       *is_good_build = !data->use_fallback;
1165       return CEED_ERROR_SUCCESS;
1166     }
1167   }
1168 
1169   // Check field compatibility
1170   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
1171   {
1172     bool has_shared_bases = true;
1173 
1174     for (CeedInt i = 0; i < num_input_fields; i++) {
1175       CeedBasis basis;
1176 
1177       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
1178       if (basis != CEED_BASIS_NONE) {
1179         bool        is_tensor = true;
1180         const char *resource;
1181         char       *resource_root;
1182         Ceed        basis_ceed;
1183 
1184         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
1185         is_all_tensor    = is_all_tensor && is_tensor;
1186         is_all_nontensor = is_all_nontensor && !is_tensor;
1187         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
1188         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
1189         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
1190         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
1191         CeedCallBackend(CeedFree(&resource_root));
1192         CeedCallBackend(CeedDestroy(&basis_ceed));
1193       }
1194       CeedCallBackend(CeedBasisDestroy(&basis));
1195     }
1196 
1197     for (CeedInt i = 0; i < num_output_fields; i++) {
1198       CeedBasis basis;
1199 
1200       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
1201       if (basis != CEED_BASIS_NONE) {
1202         bool        is_tensor = true;
1203         const char *resource;
1204         char       *resource_root;
1205         Ceed        basis_ceed;
1206 
1207         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
1208         is_all_tensor    = is_all_tensor && is_tensor;
1209         is_all_nontensor = is_all_nontensor && !is_tensor;
1210 
1211         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
1212         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
1213         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
1214         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
1215         CeedCallBackend(CeedFree(&resource_root));
1216         CeedCallBackend(CeedDestroy(&basis_ceed));
1217       }
1218       CeedCallBackend(CeedBasisDestroy(&basis));
1219     }
1220     // -- Fallback to ref if not all bases are shared
1221     if (!has_shared_bases) {
1222       *is_good_build = false;
1223       return CEED_ERROR_SUCCESS;
1224     }
1225   }
1226   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1227   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
1228   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
1229   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
1230 
1231   // Get operator data
1232   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
1233   {
1234     CeedInt max_P = 0, max_P_1d = 0;
1235 
1236     CeedCallBackend(CeedOperatorBuildKernelData_Hip_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields,
1237                                                         qf_output_fields, &max_P, &max_P_1d, &Q, &Q_1d, &max_dim, &is_all_tensor, &use_3d_slices));
1238     data->max_P_1d = is_all_tensor ? max_P_1d : max_P;
1239   }
1240   if (is_at_points) {
1241     CeedInt                  coords_dim = 0;
1242     CeedElemRestriction_Hip *rstr_data;
1243     CeedElemRestriction      rstr_points = NULL;
1244 
1245     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
1246     CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
1247     CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
1248     CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_points, &coords_dim));
1249     CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data));
1250     data->points.indices = (CeedInt *)rstr_data->d_offsets;
1251     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
1252     if (max_dim == 0) max_dim = coords_dim;
1253     if (Q_1d == 0) max_num_points = ceil(pow(max_num_points, 1.0 / max_dim));
1254   }
1255   if (max_dim == 0) max_dim = 1;
1256   data->dim = max_dim;
1257   if (is_at_points) use_3d_slices = false;
1258   if (Q_1d == 0) {
1259     if (is_at_points) Q_1d = max_num_points;
1260     else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d));
1261   }
1262   if (Q == 0) Q = Q_1d;
1263   data->Q    = Q;
1264   data->Q_1d = Q_1d;
1265 
1266   // Check for restriction only identity operator
1267   {
1268     bool is_identity_qf;
1269 
1270     CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf));
1271     if (is_identity_qf) {
1272       CeedEvalMode eval_mode_in, eval_mode_out;
1273 
1274       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in));
1275       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out));
1276       CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND,
1277                 "Backend does not implement restriction only identity operators");
1278     }
1279   }
1280 
1281   // Load basis source files
1282   if (!is_all_nontensor) {
1283     code << tab << "// Tensor basis source\n";
1284     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
1285   }
1286   if (!is_all_tensor) {
1287     code << tab << "// Non-tensor basis source\n";
1288     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n";
1289   }
1290   if (is_at_points) {
1291     code << tab << "// AtPoints basis source\n";
1292     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n";
1293   }
1294   if (!is_all_tensor && !is_all_nontensor) {
1295     code << tab << "// Tensor basis source\n";
1296     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h>\n\n";
1297   }
1298   code << tab << "// CodeGen operator source\n";
1299   code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
1300 
1301   // Get QFunction name
1302   std::string qfunction_name(qf_data->qfunction_name);
1303   std::string operator_name;
1304 
1305   operator_name = "CeedKernelHipGenOperator_" + qfunction_name;
1306 
1307   // Define CEED_Q_VLA
1308   code << "\n" << tab << "#undef CEED_Q_VLA\n";
1309   if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
1310     code << tab << "#define CEED_Q_VLA 1\n\n";
1311   } else {
1312     code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n";
1313   }
1314 
1315   // Add user QFunction source
1316   {
1317     const char *source_path;
1318 
1319     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
1320     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
1321 
1322     code << tab << "// User QFunction source\n";
1323     code << tab << "#include \"" << source_path << "\"\n\n";
1324   }
1325 
1326   // Setup
1327   code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
1328   code << tab << "// Operator Kernel\n";
1329   code << tab << "// \n";
1330   code << tab << "// d_[in,out]_i:   CeedVector device array\n";
1331   code << tab << "// r_[in,out]_e_i: Element vector register\n";
1332   code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
1333   code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
1334   code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
1335   code << tab << "// \n";
1336   code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
1337   code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
1338   code << tab << "// -----------------------------------------------------------------------------\n";
1339   code << tab << "extern \"C\" __launch_bounds__(BLOCK_SIZE)\n";
1340   code << "__global__ void " << operator_name
1341        << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar* W, Points_Hip points) {\n";
1342   tab.push();
1343 
1344   // Scratch buffers
1345   for (CeedInt i = 0; i < num_input_fields; i++) {
1346     CeedEvalMode eval_mode;
1347 
1348     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1349     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
1350       code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
1351     }
1352   }
1353   for (CeedInt i = 0; i < num_output_fields; i++) {
1354     code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
1355   }
1356 
1357   code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
1358   if (!is_all_tensor) {
1359     code << tab << "const CeedInt Q = " << Q << ";\n";
1360   }
1361   if (!is_all_nontensor) {
1362     code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
1363   }
1364   if (is_at_points) {
1365     code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n";
1366     code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
1367   }
1368 
1369   // Shared data
1370   code << tab << "extern __shared__ CeedScalar slice[];\n";
1371   code << tab << "SharedData_Hip data;\n";
1372   code << tab << "data.t_id_x = threadIdx.x;\n";
1373   code << tab << "data.t_id_y = threadIdx.y;\n";
1374   code << tab << "data.t_id_z = threadIdx.z;\n";
1375   code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
1376   code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
1377 
1378   // -- Determine input mat reuse
1379   FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
1380 
1381   for (CeedInt i = 0; i < num_input_fields; i++) {
1382     input_matrix_reuse[i].index = -1;
1383   }
1384   for (CeedInt i = 0; i < num_input_fields; i++) {
1385     bool         is_tensor = true;
1386     CeedEvalMode eval_mode_i;
1387     CeedBasis    basis_i;
1388 
1389     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
1390     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
1391     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
1392     CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
1393     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
1394       CeedEvalMode eval_mode_j;
1395       CeedBasis    basis_j;
1396 
1397       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1398       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1399       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1400       if (basis_i == basis_j) {
1401         if (is_tensor) {
1402           input_matrix_reuse[i].index     = j;
1403           input_matrix_reuse[i].is_input  = true;
1404           input_matrix_reuse[i].eval_mode = eval_mode_j;
1405         } else {
1406           // For non-tensor can only re-use with the same eval mode
1407           if (eval_mode_i == eval_mode_j) {
1408             input_matrix_reuse[i].index     = j;
1409             input_matrix_reuse[i].is_input  = true;
1410             input_matrix_reuse[i].eval_mode = eval_mode_j;
1411           }
1412         }
1413       }
1414       CeedCallBackend(CeedBasisDestroy(&basis_j));
1415     }
1416     CeedCallBackend(CeedBasisDestroy(&basis_i));
1417   }
1418 
1419   // -- Determine output mat reuse
1420   FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
1421 
1422   for (CeedInt i = 0; i < num_output_fields; i++) {
1423     output_matrix_reuse[i].index = -1;
1424   }
1425   for (CeedInt i = 0; i < num_output_fields; i++) {
1426     bool         is_tensor = true;
1427     CeedEvalMode eval_mode_i;
1428     CeedBasis    basis_i;
1429 
1430     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
1431     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
1432     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
1433       CeedEvalMode eval_mode_j;
1434       CeedBasis    basis_j;
1435 
1436       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1437       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1438       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1439       if (basis_i == basis_j) {
1440         if (is_tensor) {
1441           output_matrix_reuse[i].index     = j;
1442           output_matrix_reuse[i].is_input  = true;
1443           output_matrix_reuse[i].eval_mode = eval_mode_j;
1444         } else {
1445           // For non-tensor can only re-use with the same eval mode
1446           if (eval_mode_i == eval_mode_j) {
1447             output_matrix_reuse[i].index     = j;
1448             output_matrix_reuse[i].is_input  = true;
1449             output_matrix_reuse[i].eval_mode = eval_mode_j;
1450           }
1451         }
1452       }
1453       CeedCallBackend(CeedBasisDestroy(&basis_j));
1454     }
1455     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
1456       CeedEvalMode eval_mode_j;
1457       CeedBasis    basis_j;
1458 
1459       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
1460       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1461       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
1462       CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
1463       if (basis_i == basis_j) {
1464         if (is_tensor) {
1465           output_matrix_reuse[i].index     = j;
1466           output_matrix_reuse[i].is_input  = false;
1467           output_matrix_reuse[i].eval_mode = eval_mode_j;
1468         } else {
1469           // For non-tensor can only re-use with the same eval mode
1470           if (eval_mode_i == eval_mode_j) {
1471             output_matrix_reuse[i].index     = j;
1472             output_matrix_reuse[i].is_input  = false;
1473             output_matrix_reuse[i].eval_mode = eval_mode_j;
1474           }
1475         }
1476       }
1477       CeedCallBackend(CeedBasisDestroy(&basis_j));
1478     }
1479     CeedCallBackend(CeedBasisDestroy(&basis_i));
1480   }
1481 
1482   // Initialize constants, and matrices B and G
1483   code << "\n" << tab << "// Input field constants and basis data\n";
1484   for (CeedInt i = 0; i < num_input_fields; i++) {
1485     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
1486                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false));
1487   }
1488   code << "\n" << tab << "// Output field constants and basis data\n";
1489   for (CeedInt i = 0; i < num_output_fields; i++) {
1490     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
1491                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false));
1492   }
1493 
1494   // Loop over all elements
1495   code << "\n" << tab << "// Element loop\n";
1496   code << tab << "__syncthreads();\n";
1497   code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
1498   tab.push();
1499 
1500   // -- Compute minimum buffer space needed
1501   CeedInt max_rstr_buffer_size = 1;
1502 
1503   for (CeedInt i = 0; i < num_input_fields; i++) {
1504     CeedEvalMode eval_mode;
1505 
1506     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1507     if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
1508       CeedInt             num_comp;
1509       CeedElemRestriction elem_rstr;
1510 
1511       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
1512       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1513       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1514       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1515     }
1516   }
1517   for (CeedInt i = 0; i < num_output_fields; i++) {
1518     CeedEvalMode eval_mode;
1519 
1520     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
1521     if (eval_mode != CEED_EVAL_NONE) {
1522       CeedInt             num_comp;
1523       CeedElemRestriction elem_rstr;
1524 
1525       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
1526       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1527       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1528       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1529     }
1530   }
1531   code << tab << "// Scratch restriction buffer space\n";
1532   code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
1533 
1534   // -- Determine best input field processing order
1535   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
1536 
1537   for (CeedInt i = 0; i < num_input_fields; i++) {
1538     field_rstr_in_buffer[i] = -1;
1539     input_field_order[i]    = -1;
1540   }
1541   {
1542     bool    is_ordered[CEED_FIELD_MAX];
1543     CeedInt curr_index = 0;
1544 
1545     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
1546     for (CeedInt i = 0; i < num_input_fields; i++) {
1547       CeedVector          vec_i;
1548       CeedElemRestriction rstr_i;
1549 
1550       if (is_ordered[i]) continue;
1551       field_rstr_in_buffer[i]       = i;
1552       is_ordered[i]                 = true;
1553       input_field_order[curr_index] = i;
1554       curr_index++;
1555       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
1556       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
1557       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
1558       for (CeedInt j = i + 1; j < num_input_fields; j++) {
1559         CeedVector          vec_j;
1560         CeedElemRestriction rstr_j;
1561 
1562         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
1563         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
1564         if (rstr_i == rstr_j && vec_i == vec_j) {
1565           field_rstr_in_buffer[j]       = i;
1566           is_ordered[j]                 = true;
1567           input_field_order[curr_index] = j;
1568           curr_index++;
1569         }
1570         CeedCallBackend(CeedVectorDestroy(&vec_j));
1571         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
1572       }
1573       CeedCallBackend(CeedVectorDestroy(&vec_i));
1574       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
1575     }
1576   }
1577 
1578   // -- Input restriction and basis
1579   code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
1580   for (CeedInt i = 0; i < num_input_fields; i++) {
1581     const char   *field_name;
1582     const CeedInt f = input_field_order[i];
1583 
1584     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
1585     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
1586 
1587     // ---- Restriction
1588     CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
1589                                                                max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1590 
1591     // ---- Basis action
1592     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
1593                                                          is_all_tensor, is_at_points, use_3d_slices));
1594   }
1595 
1596   // -- Q function
1597   CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
1598                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
1599                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices));
1600 
1601   // -- Output basis and restriction
1602   code << "\n" << tab << "// -- Output field basis action and restrictions\n";
1603   for (CeedInt i = 0; i < num_output_fields; i++) {
1604     const char *field_name;
1605 
1606     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
1607     code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
1608 
1609     // ---- Basis action
1610     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
1611                                                          is_all_tensor, is_at_points, use_3d_slices));
1612 
1613     // ---- Restriction
1614     CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, i, NULL, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d,
1615                                                                false, is_all_tensor, is_at_points, use_3d_slices));
1616   }
1617 
1618   // Close loop and function
1619   tab.pop();
1620   code << tab << "}\n";
1621   tab.pop();
1622   code << tab << "}\n";
1623   code << tab << "// -----------------------------------------------------------------------------\n\n";
1624 
1625   CeedInt block_sizes[3] = {0, 0, 0};
1626   CeedInt num_elem;
1627 
1628   // Compile
1629   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
1630   CeedCallBackend(BlockGridCalculate_Hip_gen(is_all_tensor ? max_dim : 1, num_elem, data->max_P_1d, is_all_tensor ? Q_1d : Q, block_sizes));
1631   {
1632     bool is_compile_good = false;
1633 
1634     data->thread_1d = block_sizes[0];
1635     CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "OP_T_1D", block_sizes[0], "BLOCK_SIZE",
1636                                        block_sizes[0] * block_sizes[1] * block_sizes[2]));
1637     if (is_compile_good) {
1638       *is_good_build = true;
1639       CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, operator_name.c_str(), &data->op));
1640     } else {
1641       *is_good_build     = false;
1642       data->use_fallback = true;
1643     }
1644   }
1645   CeedCallBackend(CeedOperatorSetSetupDone(op));
1646   CeedCallBackend(CeedDestroy(&ceed));
1647   CeedCallBackend(CeedQFunctionDestroy(&qf));
1648   return CEED_ERROR_SUCCESS;
1649 }
1650 
1651 //------------------------------------------------------------------------------
1652 // Build AtPoints assembly operator kernel
1653 //------------------------------------------------------------------------------
1654 static int CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op, bool is_full, bool *is_good_build) {
1655   bool                   is_all_tensor = true, is_at_points = false, use_3d_slices = false;
1656   Ceed                   ceed;
1657   CeedInt                Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
1658   CeedQFunctionField    *qf_input_fields, *qf_output_fields;
1659   CeedQFunction_Hip_gen *qf_data;
1660   CeedQFunction          qf;
1661   CeedOperatorField     *op_input_fields, *op_output_fields;
1662   CeedOperator_Hip_gen  *data;
1663   std::ostringstream     code;
1664   Tab                    tab;
1665 
1666   // Check compatibility
1667   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1668   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
1669   CeedCheck(is_at_points, ceed, CEED_ERROR_BACKEND, "Only AtPoints operator assembly supported");
1670 
1671   // Retrieve operator data
1672   CeedCallBackend(CeedOperatorGetData(op, &data));
1673   Q       = data->Q;
1674   Q_1d    = data->Q_1d;
1675   max_dim = data->dim;
1676   {
1677     CeedElemRestriction rstr_points = NULL;
1678 
1679     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
1680     CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
1681     CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
1682     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
1683   }
1684   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
1685   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
1686   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
1687   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
1688 
1689   // Load basis source files
1690   code << tab << "// Tensor basis source\n";
1691   code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
1692   code << tab << "// AtPoints basis source\n";
1693   code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n";
1694   code << tab << "// CodeGen operator source\n";
1695   code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
1696 
1697   // Get QFunction name
1698   std::string qfunction_name(qf_data->qfunction_name);
1699   std::string operator_name;
1700 
1701   if (is_full) {
1702     operator_name = "CeedKernelHipGenOperatorFullAssembly_" + qfunction_name;
1703   } else {
1704     operator_name = "CeedKernelHipGenOperatorDiagonalAssembly_" + qfunction_name;
1705   }
1706 
1707   // Define CEED_Q_VLA
1708   code << "\n" << tab << "#undef CEED_Q_VLA\n";
1709   code << tab << "#define CEED_Q_VLA 1\n\n";
1710 
1711   // Add user QFunction source
1712   {
1713     const char *source_path;
1714 
1715     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
1716     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
1717 
1718     code << tab << "// User QFunction source\n";
1719     code << tab << "#include \"" << source_path << "\"\n\n";
1720   }
1721 
1722   // Setup
1723   code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
1724   code << tab << "// Operator Assembly Kernel\n";
1725   code << tab << "// \n";
1726   code << tab << "// d_[in,out]_i:   CeedVector device array\n";
1727   code << tab << "// r_[in,out]_e_i: Element vector register\n";
1728   code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
1729   code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
1730   code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
1731   code << tab << "// \n";
1732   code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
1733   code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
1734   code << tab << "// -----------------------------------------------------------------------------\n";
1735   code << tab << "extern \"C\" __global__ void " << operator_name
1736        << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar *W, Points_Hip "
1737           "points, CeedScalar *__restrict__ values_array) {\n";
1738   tab.push();
1739 
1740   // Scratch buffers
1741   for (CeedInt i = 0; i < num_input_fields; i++) {
1742     CeedEvalMode eval_mode;
1743 
1744     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1745     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
1746       code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
1747     }
1748   }
1749   for (CeedInt i = 0; i < num_output_fields; i++) {
1750     code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
1751   }
1752 
1753   code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
1754   code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
1755   code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n";
1756   code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
1757 
1758   // Shared data
1759   code << tab << "extern __shared__ CeedScalar slice[];\n";
1760   code << tab << "SharedData_Hip data;\n";
1761   code << tab << "data.t_id_x = threadIdx.x;\n";
1762   code << tab << "data.t_id_y = threadIdx.y;\n";
1763   code << tab << "data.t_id_z = threadIdx.z;\n";
1764   code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
1765   code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
1766 
1767   // -- Determine input mat reuse
1768   FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
1769 
1770   for (CeedInt i = 0; i < num_input_fields; i++) {
1771     input_matrix_reuse[i].index = -1;
1772   }
1773   for (CeedInt i = 0; i < num_input_fields; i++) {
1774     CeedEvalMode eval_mode_i;
1775     CeedBasis    basis_i;
1776 
1777     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
1778     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
1779     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
1780     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
1781       CeedEvalMode eval_mode_j;
1782       CeedBasis    basis_j;
1783 
1784       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1785       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1786       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1787       if (basis_i == basis_j) {
1788         input_matrix_reuse[i].index     = j;
1789         input_matrix_reuse[i].is_input  = true;
1790         input_matrix_reuse[i].eval_mode = eval_mode_j;
1791       }
1792       CeedCallBackend(CeedBasisDestroy(&basis_j));
1793     }
1794     CeedCallBackend(CeedBasisDestroy(&basis_i));
1795   }
1796 
1797   // -- Determine output mat reuse
1798   FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
1799 
1800   for (CeedInt i = 0; i < num_output_fields; i++) {
1801     output_matrix_reuse[i].index = -1;
1802   }
1803   for (CeedInt i = 0; i < num_output_fields; i++) {
1804     CeedEvalMode eval_mode_i;
1805     CeedBasis    basis_i;
1806 
1807     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
1808     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
1809     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
1810       CeedEvalMode eval_mode_j;
1811       CeedBasis    basis_j;
1812 
1813       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1814       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1815       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1816       if (basis_i == basis_j) {
1817         output_matrix_reuse[i].index     = j;
1818         output_matrix_reuse[i].is_input  = true;
1819         output_matrix_reuse[i].eval_mode = eval_mode_j;
1820       }
1821       CeedCallBackend(CeedBasisDestroy(&basis_j));
1822     }
1823     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
1824       CeedEvalMode eval_mode_j;
1825       CeedBasis    basis_j;
1826 
1827       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
1828       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1829       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
1830       if (basis_i == basis_j) {
1831         output_matrix_reuse[i].index     = j;
1832         output_matrix_reuse[i].is_input  = false;
1833         output_matrix_reuse[i].eval_mode = eval_mode_j;
1834       }
1835       CeedCallBackend(CeedBasisDestroy(&basis_j));
1836     }
1837     CeedCallBackend(CeedBasisDestroy(&basis_i));
1838   }
1839 
1840   // Initialize constants, and matrices B and G
1841   code << "\n" << tab << "// Input field constants and basis data\n";
1842   for (CeedInt i = 0; i < num_input_fields; i++) {
1843     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
1844                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false));
1845   }
1846   code << "\n" << tab << "// Output field constants and basis data\n";
1847   for (CeedInt i = 0; i < num_output_fields; i++) {
1848     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
1849                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false));
1850   }
1851 
1852   // Loop over all elements
1853   code << "\n" << tab << "// Element loop\n";
1854   code << tab << "__syncthreads();\n";
1855   code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
1856   tab.push();
1857 
1858   // -- Compute minimum buffer space needed
1859   CeedInt max_rstr_buffer_size = 1;
1860 
1861   for (CeedInt i = 0; i < num_input_fields; i++) {
1862     CeedEvalMode eval_mode;
1863 
1864     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1865     if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
1866       CeedInt             num_comp;
1867       CeedElemRestriction elem_rstr;
1868 
1869       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
1870       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1871       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1872       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1873     }
1874   }
1875   for (CeedInt i = 0; i < num_output_fields; i++) {
1876     CeedEvalMode eval_mode;
1877 
1878     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
1879     if (eval_mode != CEED_EVAL_NONE) {
1880       CeedInt             num_comp;
1881       CeedElemRestriction elem_rstr;
1882 
1883       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
1884       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1885       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1886       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1887     }
1888   }
1889   code << tab << "// Scratch restriction buffer space\n";
1890   code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
1891 
1892   // -- Determine best input field processing order
1893   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
1894 
1895   for (CeedInt i = 0; i < num_input_fields; i++) {
1896     field_rstr_in_buffer[i] = -1;
1897     input_field_order[i]    = -1;
1898   }
1899   {
1900     bool    is_ordered[CEED_FIELD_MAX];
1901     CeedInt curr_index = 0;
1902 
1903     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
1904     for (CeedInt i = 0; i < num_input_fields; i++) {
1905       CeedVector          vec_i;
1906       CeedElemRestriction rstr_i;
1907 
1908       if (is_ordered[i]) continue;
1909       field_rstr_in_buffer[i]       = i;
1910       is_ordered[i]                 = true;
1911       input_field_order[curr_index] = i;
1912       curr_index++;
1913       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
1914       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
1915       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
1916       for (CeedInt j = i + 1; j < num_input_fields; j++) {
1917         CeedVector          vec_j;
1918         CeedElemRestriction rstr_j;
1919 
1920         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
1921         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
1922         if (rstr_i == rstr_j && vec_i == vec_j) {
1923           field_rstr_in_buffer[j]       = i;
1924           is_ordered[j]                 = true;
1925           input_field_order[curr_index] = j;
1926           curr_index++;
1927         }
1928         CeedCallBackend(CeedVectorDestroy(&vec_j));
1929         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
1930       }
1931       CeedCallBackend(CeedVectorDestroy(&vec_i));
1932       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
1933     }
1934   }
1935 
1936   // -- Input restriction and basis
1937   code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
1938   CeedInt active_field_index = -1;
1939 
1940   for (CeedInt i = 0; i < num_input_fields; i++) {
1941     bool          is_active = false;
1942     const char   *field_name;
1943     const CeedInt f = input_field_order[i];
1944 
1945     {
1946       CeedVector vec;
1947 
1948       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
1949       is_active = vec == CEED_VECTOR_ACTIVE;
1950       CeedCallBackend(CeedVectorDestroy(&vec));
1951     }
1952 
1953     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
1954     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
1955 
1956     if (is_active) {
1957       std::string var_suffix = "_in_" + std::to_string(f);
1958 
1959       code << tab << "// Active field - no restriction or basis action here\n";
1960       if (active_field_index == -1) {
1961         active_field_index = f;
1962         code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? "P_1d" + var_suffix : "1")
1963              << "] = {0.0};\n";
1964       } else {
1965         code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_in_" << active_field_index << ";\n";
1966       }
1967     } else {
1968       // ---- Restriction
1969       CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
1970                                                                  max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1971 
1972       // ---- Basis action
1973       CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
1974                                                            is_all_tensor, is_at_points, use_3d_slices));
1975     }
1976   }
1977 
1978   // -- Loop over active field
1979   std::string active_var_suffix = "_in_" + std::to_string(active_field_index);
1980 
1981   code << "\n" << tab << "// Loop over nodes in active field\n";
1982   code << tab << "for (CeedInt n = 0; n < num_comp" << active_var_suffix << "*P_1d" << active_var_suffix
1983        << (max_dim > 1 ? "*P_1d" + active_var_suffix : "") << (max_dim > 2 ? "*P_1d" + active_var_suffix : "") << "; n++) {\n";
1984   tab.push();
1985 
1986   // -- Set current active node and component to 1
1987   code << tab << "// Set current active node and component to 1.0\n";
1988   code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 1.0, r_e"
1989        << active_var_suffix << ");\n\n";
1990 
1991   for (CeedInt i = 0; i < num_input_fields; i++) {
1992     bool          is_active = false;
1993     const char   *field_name;
1994     const CeedInt f = input_field_order[i];
1995 
1996     {
1997       CeedVector vec;
1998 
1999       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
2000       is_active = vec == CEED_VECTOR_ACTIVE;
2001       CeedCallBackend(CeedVectorDestroy(&vec));
2002     }
2003     if (!is_active) continue;
2004 
2005     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
2006     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
2007 
2008     // ---- Basis action
2009     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
2010                                                          is_all_tensor, is_at_points, use_3d_slices));
2011   }
2012 
2013   // -- Q function
2014   CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
2015                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
2016                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices));
2017 
2018   // -- Output basis and restriction
2019   code << "\n" << tab << "// -- Output field basis action and restrictions\n";
2020   for (CeedInt i = 0; i < num_output_fields; i++) {
2021     bool        is_active = false;
2022     const char *field_name;
2023 
2024     {
2025       CeedVector vec;
2026 
2027       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
2028       is_active = vec == CEED_VECTOR_ACTIVE;
2029       CeedCallBackend(CeedVectorDestroy(&vec));
2030     }
2031     if (!is_active) continue;
2032 
2033     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
2034     code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
2035 
2036     // ---- Basis action
2037     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
2038                                                          is_all_tensor, is_at_points, use_3d_slices));
2039 
2040     // ---- Restriction
2041     if (is_full) {
2042       std::string         var_suffix = "_out_" + std::to_string(i);
2043       CeedInt             comp_stride;
2044       CeedSize            l_size;
2045       CeedElemRestriction elem_rstr;
2046 
2047       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
2048       CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
2049       code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
2050       CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
2051       code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
2052       code << tab << "WriteLVecStandard" << max_dim << "d_Assembly<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
2053            << ">(data, l_size" << var_suffix << ", elem, n, r_e" << var_suffix << ", values_array);\n";
2054       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
2055     } else {
2056       std::string         var_suffix = "_out_" + std::to_string(i);
2057       CeedInt             comp_stride;
2058       CeedSize            l_size;
2059       CeedElemRestriction elem_rstr;
2060 
2061       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
2062       CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
2063       code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
2064       CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
2065       code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
2066       code << tab << "WriteLVecStandard" << max_dim << "d_Single<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
2067            << ">(data, l_size" << var_suffix << ", elem, n, indices.outputs[" << i << "], r_e" << var_suffix << ", values_array);\n";
2068       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
2069     }
2070   }
2071 
2072   // -- Reset current active node and component
2073   code << "\n" << tab << "// Reset current active node and component to 0.0\n";
2074   code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 0.0, r_e"
2075        << active_var_suffix << ");\n";
2076 
2077   // -- End of loop over active field
2078   tab.pop();
2079   code << tab << "}\n";
2080 
2081   // Close loop and function
2082   tab.pop();
2083   code << tab << "}\n";
2084   tab.pop();
2085   code << tab << "}\n";
2086   code << tab << "// -----------------------------------------------------------------------------\n\n";
2087 
2088   CeedInt block_sizes[3] = {0, 0, 0};
2089   CeedInt num_elem;
2090 
2091   // Compile
2092   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
2093   CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes));
2094   {
2095     bool is_compile_good = false;
2096 
2097     data->thread_1d = block_sizes[0];
2098     CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good,
2099                                        is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 2, "OP_T_1D", block_sizes[0],
2100                                        "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]));
2101     if (is_compile_good) {
2102       *is_good_build = true;
2103       CeedCallBackend(CeedGetKernel_Hip(ceed, is_full ? data->module_assemble_full : data->module_assemble_diagonal, operator_name.c_str(),
2104                                         is_full ? &data->assemble_full : &data->assemble_diagonal));
2105     } else {
2106       *is_good_build              = false;
2107       data->use_assembly_fallback = true;
2108     }
2109   }
2110   CeedCallBackend(CeedDestroy(&ceed));
2111   CeedCallBackend(CeedQFunctionDestroy(&qf));
2112   return CEED_ERROR_SUCCESS;
2113 }
2114 
2115 extern "C" int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) {
2116   return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, false, is_good_build);
2117 }
2118 
2119 extern "C" int CeedOperatorBuildKernelFullAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) {
2120   return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, true, is_good_build);
2121 }
2122 //------------------------------------------------------------------------------
2123 // Build QFunction assembly operator kernel
2124 //------------------------------------------------------------------------------
2125 extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperator op, bool *is_good_build) {
2126   bool                   is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
2127   Ceed                   ceed;
2128   CeedInt                Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0;
2129   CeedQFunctionField    *qf_input_fields, *qf_output_fields;
2130   CeedQFunction_Hip_gen *qf_data;
2131   CeedQFunction          qf;
2132   CeedOperatorField     *op_input_fields, *op_output_fields;
2133   CeedOperator_Hip_gen  *data;
2134   std::ostringstream     code;
2135   Tab                    tab;
2136 
2137   // Check compatibility
2138   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
2139   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
2140   CeedCheck(!is_at_points, ceed, CEED_ERROR_BACKEND, "AtPoints QFunction assembly is not supported");
2141 
2142   // Check field compatibility
2143   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
2144   {
2145     bool has_shared_bases = true;
2146 
2147     for (CeedInt i = 0; i < num_input_fields; i++) {
2148       CeedBasis basis;
2149 
2150       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
2151       if (basis != CEED_BASIS_NONE) {
2152         bool        is_tensor = true;
2153         const char *resource;
2154         char       *resource_root;
2155         Ceed        basis_ceed;
2156 
2157         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
2158         is_all_tensor    = is_all_tensor && is_tensor;
2159         is_all_nontensor = is_all_nontensor && !is_tensor;
2160         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
2161         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
2162         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
2163         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
2164         CeedCallBackend(CeedFree(&resource_root));
2165         CeedCallBackend(CeedDestroy(&basis_ceed));
2166       }
2167       CeedCallBackend(CeedBasisDestroy(&basis));
2168     }
2169 
2170     for (CeedInt i = 0; i < num_output_fields; i++) {
2171       CeedBasis basis;
2172 
2173       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
2174       if (basis != CEED_BASIS_NONE) {
2175         bool        is_tensor = true;
2176         const char *resource;
2177         char       *resource_root;
2178         Ceed        basis_ceed;
2179 
2180         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
2181         is_all_tensor    = is_all_tensor && is_tensor;
2182         is_all_nontensor = is_all_nontensor && !is_tensor;
2183 
2184         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
2185         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
2186         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
2187         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
2188         CeedCallBackend(CeedFree(&resource_root));
2189         CeedCallBackend(CeedDestroy(&basis_ceed));
2190       }
2191       CeedCallBackend(CeedBasisDestroy(&basis));
2192     }
2193   }
2194 
2195   // Retrieve operator data
2196   CeedCallBackend(CeedOperatorGetData(op, &data));
2197   Q       = data->Q;
2198   Q_1d    = data->Q_1d;
2199   max_dim = data->dim;
2200   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
2201   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
2202   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
2203 
2204   // Load basis source files
2205   if (!is_all_nontensor) {
2206     code << tab << "// Tensor basis source\n";
2207     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
2208   }
2209   if (!is_all_tensor) {
2210     code << tab << "// Non-tensor basis source\n";
2211     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n";
2212   }
2213   if (!is_all_tensor && !is_all_nontensor) {
2214     code << "// Tensor basis source\n";
2215     code << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h>\n\n";
2216   }
2217   code << "// CodeGen operator source\n";
2218   code << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
2219 
2220   // Get QFunction name
2221   std::string qfunction_name(qf_data->qfunction_name);
2222   std::string operator_name;
2223 
2224   operator_name = "CeedKernelHipGenQFunctionAssembly_" + qfunction_name;
2225 
2226   // Define CEED_Q_VLA
2227   code << "\n" << tab << "#undef CEED_Q_VLA\n";
2228   if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
2229     code << tab << "#define CEED_Q_VLA 1\n\n";
2230   } else {
2231     code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n";
2232   }
2233 
2234   // Add user QFunction source
2235   {
2236     const char *source_path;
2237 
2238     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
2239     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
2240 
2241     code << tab << "// User QFunction source\n";
2242     code << tab << "#include \"" << source_path << "\"\n\n";
2243   }
2244 
2245   // Setup
2246   code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
2247   code << tab << "// Operator Assembly Kernel\n";
2248   code << tab << "// \n";
2249   code << tab << "// d_[in,out]_i:   CeedVector device array\n";
2250   code << tab << "// r_[in,out]_e_i: Element vector register\n";
2251   code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
2252   code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
2253   code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
2254   code << tab << "// \n";
2255   code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
2256   code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
2257   code << tab << "// -----------------------------------------------------------------------------\n";
2258   code << tab << "extern \"C\" __global__ void " << operator_name
2259        << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar *W, Points_Hip "
2260           "points, CeedScalar *__restrict__ values_array) {\n";
2261   tab.push();
2262 
2263   // Scratch buffers
2264   for (CeedInt i = 0; i < num_input_fields; i++) {
2265     CeedEvalMode eval_mode;
2266 
2267     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
2268     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
2269       code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
2270     }
2271   }
2272   for (CeedInt i = 0; i < num_output_fields; i++) {
2273     code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
2274   }
2275 
2276   code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
2277   if (!is_all_tensor) {
2278     code << tab << "const CeedInt Q = " << Q << ";\n";
2279   }
2280   if (!is_all_nontensor) {
2281     code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
2282   }
2283 
2284   // Shared data
2285   code << tab << "extern __shared__ CeedScalar slice[];\n";
2286   code << tab << "SharedData_Hip data;\n";
2287   code << tab << "data.t_id_x = threadIdx.x;\n";
2288   code << tab << "data.t_id_y = threadIdx.y;\n";
2289   code << tab << "data.t_id_z = threadIdx.z;\n";
2290   code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
2291   code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
2292 
2293   // -- Determine input mat reuse
2294   FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
2295 
2296   for (CeedInt i = 0; i < num_input_fields; i++) {
2297     input_matrix_reuse[i].index = -1;
2298   }
2299   for (CeedInt i = 0; i < num_input_fields; i++) {
2300     bool         is_tensor = true;
2301     CeedEvalMode eval_mode_i;
2302     CeedBasis    basis_i;
2303 
2304     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
2305     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
2306     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
2307     CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
2308     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
2309       CeedEvalMode eval_mode_j;
2310       CeedBasis    basis_j;
2311 
2312       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
2313       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
2314       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
2315       if (basis_i == basis_j) {
2316         if (is_tensor) {
2317           input_matrix_reuse[i].index     = j;
2318           input_matrix_reuse[i].is_input  = true;
2319           input_matrix_reuse[i].eval_mode = eval_mode_j;
2320         } else {
2321           // For non-tensor can only re-use with the same eval mode
2322           if (eval_mode_i == eval_mode_j) {
2323             input_matrix_reuse[i].index     = j;
2324             input_matrix_reuse[i].is_input  = true;
2325             input_matrix_reuse[i].eval_mode = eval_mode_j;
2326           }
2327         }
2328       }
2329       CeedCallBackend(CeedBasisDestroy(&basis_j));
2330     }
2331     CeedCallBackend(CeedBasisDestroy(&basis_i));
2332   }
2333 
2334   // -- Determine output mat reuse
2335   FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
2336 
2337   for (CeedInt i = 0; i < num_output_fields; i++) {
2338     output_matrix_reuse[i].index = -1;
2339   }
2340   for (CeedInt i = 0; i < num_output_fields; i++) {
2341     bool         is_tensor = true;
2342     CeedEvalMode eval_mode_i;
2343     CeedBasis    basis_i;
2344 
2345     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
2346     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
2347     CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
2348     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
2349       CeedEvalMode eval_mode_j;
2350       CeedBasis    basis_j;
2351 
2352       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
2353       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
2354       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
2355       if (basis_i == basis_j) {
2356         if (is_tensor) {
2357           output_matrix_reuse[i].index     = j;
2358           output_matrix_reuse[i].is_input  = true;
2359           output_matrix_reuse[i].eval_mode = eval_mode_j;
2360         } else {
2361           // For non-tensor can only re-use with the same eval mode
2362           if (eval_mode_i == eval_mode_j) {
2363             output_matrix_reuse[i].index     = j;
2364             output_matrix_reuse[i].is_input  = true;
2365             output_matrix_reuse[i].eval_mode = eval_mode_j;
2366           }
2367         }
2368       }
2369       CeedCallBackend(CeedBasisDestroy(&basis_j));
2370     }
2371     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
2372       CeedEvalMode eval_mode_j;
2373       CeedBasis    basis_j;
2374 
2375       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
2376       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
2377       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
2378       if (basis_i == basis_j) {
2379         if (is_tensor) {
2380           output_matrix_reuse[i].index     = j;
2381           output_matrix_reuse[i].is_input  = false;
2382           output_matrix_reuse[i].eval_mode = eval_mode_j;
2383         } else {
2384           // For non-tensor can only re-use with the same eval mode
2385           if (eval_mode_i == eval_mode_j) {
2386             output_matrix_reuse[i].index     = j;
2387             output_matrix_reuse[i].is_input  = false;
2388             output_matrix_reuse[i].eval_mode = eval_mode_j;
2389           }
2390         }
2391       }
2392       CeedCallBackend(CeedBasisDestroy(&basis_j));
2393     }
2394     CeedCallBackend(CeedBasisDestroy(&basis_i));
2395   }
2396 
2397   // Initialize constants, and matrices B and G
2398   code << "\n" << tab << "// Input field constants and basis data\n";
2399   for (CeedInt i = 0; i < num_input_fields; i++) {
2400     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
2401                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, true));
2402   }
2403   code << "\n" << tab << "// Output field constants and basis data\n";
2404   for (CeedInt i = 0; i < num_output_fields; i++) {
2405     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
2406                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, true));
2407   }
2408 
2409   // Loop over all elements
2410   code << "\n" << tab << "// Element loop\n";
2411   code << tab << "__syncthreads();\n";
2412   code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
2413   tab.push();
2414 
2415   // -- Compute minimum buffer space needed
2416   CeedInt max_rstr_buffer_size = 1;
2417 
2418   for (CeedInt i = 0; i < num_input_fields; i++) {
2419     CeedEvalMode eval_mode;
2420 
2421     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
2422     if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
2423       CeedInt             num_comp;
2424       CeedElemRestriction elem_rstr;
2425 
2426       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
2427       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
2428       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
2429       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
2430     }
2431   }
2432   for (CeedInt i = 0; i < num_output_fields; i++) {
2433     CeedEvalMode eval_mode;
2434 
2435     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
2436     if (eval_mode != CEED_EVAL_NONE) {
2437       CeedInt             num_comp;
2438       CeedElemRestriction elem_rstr;
2439 
2440       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
2441       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
2442       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
2443       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
2444     }
2445   }
2446   code << tab << "// Scratch restriction buffer space\n";
2447   code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
2448 
2449   // -- Determine best input field processing order
2450   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
2451 
2452   for (CeedInt i = 0; i < num_input_fields; i++) {
2453     field_rstr_in_buffer[i] = -1;
2454     input_field_order[i]    = -1;
2455   }
2456   {
2457     bool    is_ordered[CEED_FIELD_MAX];
2458     CeedInt curr_index = 0;
2459 
2460     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
2461     for (CeedInt i = 0; i < num_input_fields; i++) {
2462       CeedVector          vec_i;
2463       CeedElemRestriction rstr_i;
2464 
2465       if (is_ordered[i]) continue;
2466       field_rstr_in_buffer[i]       = i;
2467       is_ordered[i]                 = true;
2468       input_field_order[curr_index] = i;
2469       curr_index++;
2470       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
2471       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
2472       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
2473       for (CeedInt j = i + 1; j < num_input_fields; j++) {
2474         CeedVector          vec_j;
2475         CeedElemRestriction rstr_j;
2476 
2477         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
2478         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
2479         if (rstr_i == rstr_j && vec_i == vec_j) {
2480           field_rstr_in_buffer[j]       = i;
2481           is_ordered[j]                 = true;
2482           input_field_order[curr_index] = j;
2483           curr_index++;
2484         }
2485         CeedCallBackend(CeedVectorDestroy(&vec_j));
2486         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
2487       }
2488       CeedCallBackend(CeedVectorDestroy(&vec_i));
2489       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
2490     }
2491   }
2492 
2493   // -- Input restriction and basis
2494   code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
2495   CeedInt num_active_in = 0, num_active_out = 0, qf_assembly_size_out = 0;
2496   CeedInt active_fields_in[CEED_FIELD_MAX], active_fields_out[CEED_FIELD_MAX];
2497 
2498   for (CeedInt i = 0; i < num_input_fields; i++) {
2499     bool          is_active = false;
2500     const char   *field_name;
2501     const CeedInt f = input_field_order[i];
2502 
2503     {
2504       CeedVector vec;
2505 
2506       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
2507       is_active = vec == CEED_VECTOR_ACTIVE;
2508       CeedCallBackend(CeedVectorDestroy(&vec));
2509     }
2510 
2511     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
2512     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
2513 
2514     if (is_active) {
2515       CeedEvalMode eval_mode;
2516       CeedInt      field_size;
2517 
2518       active_fields_in[num_active_in] = f;
2519       num_active_in++;
2520       CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[f], &field_size));
2521       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[f], &eval_mode));
2522       if (eval_mode == CEED_EVAL_GRAD) {
2523         code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << "dim_in_" << f << "*"
2524              << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
2525       } else {
2526         code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
2527       }
2528       code << tab << "const CeedInt field_size_in_" << f << " = " << field_size << ";\n";
2529     } else {
2530       // ---- Restriction
2531       CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
2532                                                                  max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
2533 
2534       // ---- Basis action
2535       CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
2536                                                            is_all_tensor, is_at_points, use_3d_slices));
2537     }
2538   }
2539   code << tab << "const CeedInt field_sizes_in[" << num_active_in << "] = {";
2540   for (CeedInt i = 0; i < num_active_in; i++) {
2541     code << "field_size_in_" << active_fields_in[i] << (i < num_active_in - 1 ? ", " : "");
2542   }
2543   code << "};\n";
2544   code << tab << "CeedScalar * r_q_in[" << num_active_in << "] = {";
2545   for (CeedInt i = 0; i < num_active_in; i++) {
2546     code << "r_q_in_" << active_fields_in[i] << (i < num_active_in - 1 ? ", " : "");
2547   }
2548   code << "};\n";
2549 
2550   for (CeedInt i = 0; i < num_output_fields; i++) {
2551     bool is_active = false;
2552 
2553     {
2554       CeedVector vec;
2555 
2556       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
2557       is_active = vec == CEED_VECTOR_ACTIVE;
2558       CeedCallBackend(CeedVectorDestroy(&vec));
2559     }
2560     if (is_active) {
2561       const char *field_name;
2562       CeedInt     field_size;
2563 
2564       active_fields_out[num_active_out] = i;
2565       num_active_out++;
2566       CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
2567       qf_assembly_size_out += field_size;
2568       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
2569       code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
2570       code << tab << "const CeedInt field_size_out_" << i << " = " << field_size << ";\n";
2571     }
2572   }
2573   code << tab << "const CeedInt field_sizes_out[" << num_active_out << "] = {";
2574   for (CeedInt i = 0; i < num_active_out; i++) {
2575     code << "field_size_out_" << active_fields_out[i] << (i < num_active_out - 1 ? ", " : "");
2576   }
2577   code << "};\n";
2578   code << tab << "const CeedInt total_size_out = " << qf_assembly_size_out << ";\n";
2579 
2580   // -- Loop over active field
2581   code << "\n" << tab << "CeedInt input_offset = 0;\n";
2582   code << tab << "// Loop over active QFunction input fields\n";
2583   code << tab << "const CeedInt num_active_in = " << num_active_in << ";\n";
2584   code << tab << "for (CeedInt a = 0; a < num_active_in; a++) {\n";
2585   tab.push();
2586 
2587   // -- Loop over size of active field
2588   code << "\n" << tab << "// Loop over current active input field size\n";
2589   code << tab << "const CeedInt field_size_in = field_sizes_in[a];\n";
2590   code << tab << "for (CeedInt s = 0; s < field_size_in; s++) {\n";
2591   tab.push();
2592 
2593   // -- Set current active point and component to 1
2594   code << tab << "// Set current active point and component to 1.0\n";
2595   if (is_all_tensor && (max_dim >= 3)) {
2596     code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 1.0;\n";
2597   } else {
2598     code << tab << "r_q_in[a][s] = 1.0;\n";
2599   }
2600 
2601   // -- Q function
2602   CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
2603                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
2604                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices));
2605 
2606   // -- Output basis and restriction
2607   code << "\n" << tab << "// -- Output field basis action and restrictions\n";
2608   CeedScalar offset = 0;
2609 
2610   for (CeedInt i = 0; i < num_output_fields; i++) {
2611     bool        is_active = false;
2612     const char *field_name;
2613 
2614     {
2615       CeedVector vec;
2616 
2617       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
2618       is_active = vec == CEED_VECTOR_ACTIVE;
2619       CeedCallBackend(CeedVectorDestroy(&vec));
2620     }
2621     if (!is_active) continue;
2622 
2623     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
2624     code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
2625 
2626     // ---- Restriction
2627     CeedInt field_size;
2628 
2629     code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d_QFAssembly<total_size_out, field_size_out_" << i << ", "
2630          << (is_all_tensor ? "Q_1d" : "Q") << ">(data, num_elem, elem, input_offset + s, " << offset << ", r_q_out_" << i << ", values_array);\n";
2631     CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
2632     offset += field_size;
2633   }
2634 
2635   // -- Reset current active node and component
2636   code << "\n" << tab << "// Reset current active node and component to 0.0\n";
2637   if (is_all_tensor && (max_dim >= 3)) {
2638     code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 0.0;\n";
2639   } else {
2640     code << tab << "r_q_in[a][s] = 0.0;\n";
2641   }
2642 
2643   // -- End of loop over size of active field
2644   tab.pop();
2645   code << tab << "}\n";
2646   code << tab << "input_offset += field_size_in;\n";
2647 
2648   // -- End of loop over active field
2649   tab.pop();
2650   code << tab << "}\n";
2651 
2652   // Close loop and function
2653   tab.pop();
2654   code << tab << "}\n";
2655   tab.pop();
2656   code << tab << "}\n";
2657   code << tab << "// -----------------------------------------------------------------------------\n\n";
2658 
2659   CeedInt block_sizes[3] = {0, 0, 0};
2660   CeedInt num_elem;
2661 
2662   // Compile
2663   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
2664   CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes));
2665   {
2666     bool is_compile_good = false;
2667 
2668     data->thread_1d = block_sizes[0];
2669     CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 2, "OP_T_1D", block_sizes[0],
2670                                        "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]));
2671     if (is_compile_good) {
2672       *is_good_build = true;
2673       CeedCallBackend(CeedGetKernel_Hip(ceed, data->module_assemble_qfunction, operator_name.c_str(), &data->assemble_qfunction));
2674     } else {
2675       *is_good_build              = false;
2676       data->use_assembly_fallback = true;
2677     }
2678   }
2679   CeedCallBackend(CeedDestroy(&ceed));
2680   CeedCallBackend(CeedQFunctionDestroy(&qf));
2681   return CEED_ERROR_SUCCESS;
2682 }
2683 
2684 //------------------------------------------------------------------------------
2685