xref: /libCEED/backends/hip-gen/ceed-hip-gen-operator-build.cpp (revision 0183ed61035d97ff853cf8c8e722c0fda76e54df)
1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #define CEED_DEBUG_COLOR 12
9 
10 #include <ceed.h>
11 #include <ceed/backend.h>
12 #include <ceed/gen-tools.h>
13 #include <ceed/jit-tools.h>
14 
15 #include <iostream>
16 #include <sstream>
17 #include <string>
18 
19 #include "../hip-ref/ceed-hip-ref.h"
20 #include "../hip-shared/ceed-hip-shared.h"
21 #include "../hip/ceed-hip-common.h"
22 #include "../hip/ceed-hip-compile.h"
23 #include "ceed-hip-gen.h"
24 
// Describes whether a field can reuse the shared-memory basis matrices
// (s_B_* / s_G_*) already loaded for an earlier field in the generated kernel.
struct FieldReuse_Hip {
  CeedInt      index;      // Index of the earlier field to reuse, or -1 for no reuse
  bool         is_input;   // True if the reused field is an input field (selects "_in_" vs "_out_" suffix)
  CeedEvalMode eval_mode;  // Eval mode of the reused field (G matrices are only reused when this is CEED_EVAL_GRAD)
};
30 
31 //------------------------------------------------------------------------------
32 // Calculate the block size used for launching the operator kernel
33 //------------------------------------------------------------------------------
34 extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt num_elem, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes) {
35   const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
36   if (dim == 1) {
37     CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
38 
39     elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
40     block_sizes[0]  = thread_1d;
41     block_sizes[1]  = 1;
42     block_sizes[2]  = elems_per_block;
43   } else if (dim == 2) {
44     const CeedInt elems_per_block = thread_1d < 4 ? 16 : 2;
45 
46     block_sizes[0] = thread_1d;
47     block_sizes[1] = thread_1d;
48     block_sizes[2] = elems_per_block;
49   } else if (dim == 3) {
50     const CeedInt elems_per_block = thread_1d < 6 ? 4 : (thread_1d < 8 ? 2 : 1);
51 
52     block_sizes[0] = thread_1d;
53     block_sizes[1] = thread_1d;
54     block_sizes[2] = elems_per_block;
55   }
56   return CEED_ERROR_SUCCESS;
57 }
58 
59 //------------------------------------------------------------------------------
60 // Determine type of operator
61 //------------------------------------------------------------------------------
62 static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
63                                                CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields,
64                                                CeedQFunctionField *qf_output_fields, CeedInt *max_P, CeedInt *max_P_1d, CeedInt *Q, CeedInt *Q_1d,
65                                                CeedInt *max_dim, bool *is_all_tensor, bool *use_3d_slices) {
66   // Check if all are tensor
67   *is_all_tensor = true;
68   for (CeedInt i = 0; i < num_input_fields; i++) {
69     CeedBasis basis;
70 
71     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
72     if (basis != CEED_BASIS_NONE) {
73       bool is_field_tensor;
74 
75       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
76       *is_all_tensor = *is_all_tensor && is_field_tensor;
77     }
78     CeedCallBackend(CeedBasisDestroy(&basis));
79   }
80   for (CeedInt i = 0; i < num_output_fields; i++) {
81     CeedBasis basis;
82 
83     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
84     if (basis != CEED_BASIS_NONE) {
85       bool is_field_tensor;
86 
87       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
88       *is_all_tensor = *is_all_tensor && is_field_tensor;
89     }
90     CeedCallBackend(CeedBasisDestroy(&basis));
91   }
92 
93   // Find max_P, max_P_1d, Q, and Q_1d
94   bool is_all_3d = true;
95 
96   *max_P    = 0;
97   *max_P_1d = 0;
98   *Q        = 0;
99   *Q_1d     = 0;
100   for (CeedInt i = 0; i < num_input_fields; i++) {
101     CeedBasis basis;
102 
103     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
104     if (basis != CEED_BASIS_NONE) {
105       bool    is_field_tensor;
106       CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
107 
108       // Check if 3D
109       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
110       is_all_3d = is_all_3d && (field_dim == 3);
111       *max_dim  = CeedIntMax(*max_dim, field_dim);
112 
113       // Collect P, P_1d, Q, and Q_1d
114       CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
115       *max_P = CeedIntMax(*max_P, field_P);
116       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
117       if (is_field_tensor) {
118         CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
119         *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
120       }
121       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
122       CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
123       *Q = field_Q;
124       if (is_field_tensor) {
125         CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
126         CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
127         *Q_1d = field_Q_1d;
128       }
129     }
130     CeedCallBackend(CeedBasisDestroy(&basis));
131   }
132   for (CeedInt i = 0; i < num_output_fields; i++) {
133     CeedBasis basis;
134 
135     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
136     if (basis != CEED_BASIS_NONE) {
137       bool    is_field_tensor;
138       CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
139 
140       // Check if 3D
141       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
142       is_all_3d = is_all_3d && (field_dim == 3);
143       *max_dim  = CeedIntMax(*max_dim, field_dim);
144 
145       // Collect P, P_1d, Q, and Q_1d
146       CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
147       *max_P = CeedIntMax(*max_P, field_P);
148       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
149       if (is_field_tensor) {
150         CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
151         *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
152       }
153       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
154       CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
155       *Q = field_Q;
156       if (is_field_tensor) {
157         CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
158         CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
159         *Q_1d = field_Q_1d;
160       }
161     }
162     CeedCallBackend(CeedBasisDestroy(&basis));
163   }
164 
165   // Only use 3D collocated gradient parallelization strategy when gradient is computed
166   *use_3d_slices = false;
167   if (is_all_3d && *is_all_tensor) {
168     bool was_grad_found = false;
169 
170     for (CeedInt i = 0; i < num_input_fields; i++) {
171       CeedEvalMode eval_mode;
172 
173       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
174       if (eval_mode == CEED_EVAL_GRAD) {
175         CeedBasis_Hip_shared *basis_data;
176         CeedBasis             basis;
177 
178         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
179         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
180         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
181         was_grad_found = true;
182         CeedCallBackend(CeedBasisDestroy(&basis));
183       }
184     }
185     for (CeedInt i = 0; i < num_output_fields; i++) {
186       CeedEvalMode eval_mode;
187 
188       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
189       if (eval_mode == CEED_EVAL_GRAD) {
190         CeedBasis_Hip_shared *basis_data;
191         CeedBasis             basis;
192 
193         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
194         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
195         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
196         was_grad_found = true;
197         CeedCallBackend(CeedBasisDestroy(&basis));
198       }
199     }
200   }
201   return CEED_ERROR_SUCCESS;
202 }
203 
//------------------------------------------------------------------------------
// Setup fields
//------------------------------------------------------------------------------
// Emit the per-field setup section of the generated operator kernel into `code`:
// the field constants (dim, num_comp, P/P_1d) and the shared-memory basis
// matrices (s_B_*, s_G_*), and record the matching device basis pointers in
// data->B / data->G. When field_reuse.index != -1, the shared matrices of an
// earlier field are aliased instead of loading a fresh copy.
static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i,
                                                    CeedOperatorField op_field, CeedQFunctionField qf_field, FieldReuse_Hip field_reuse,
                                                    CeedInt max_dim, CeedInt Q, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
                                                    bool use_3d_slices) {
  // Fields without a basis (CEED_BASIS_NONE) are treated as tensor by default
  bool      is_tensor = true;
  CeedBasis basis;
  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
  if (basis != CEED_BASIS_NONE) CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));

  const char           *field_name;
  std::string           var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
  std::string           P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
  std::string           option_name = (is_input ? "inputs" : "outputs");
  CeedEvalMode          eval_mode   = CEED_EVAL_NONE;
  CeedInt               elem_size = 0, num_comp = 0, dim = max_dim, P_1d = 0;
  CeedElemRestriction   elem_rstr;
  CeedBasis_Hip_shared *basis_data;

  // Field reuse info
  bool use_previous_field = field_reuse.index != -1;

  CeedCallBackend(CeedOperatorFieldGetName(op_field, &field_name));
  code << tab << "// -- " << (is_input ? "Input" : "Output") << " field " << i << ": " << field_name << "\n";

  // Get field data
  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
  }
  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
  if (basis != CEED_BASIS_NONE) {
    CeedCallBackend(CeedBasisGetData(basis, &basis_data));
    CeedCallBackend(CeedBasisGetDimension(basis, &dim));
    // P_1d holds the 1D node count for tensor bases, the total node count otherwise
    if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
    else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
  }
  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));

  // Set field constants
  code << tab << "const CeedInt dim" << var_suffix << " = " << dim << ";\n";
  if (is_tensor && !is_all_tensor) {
    CeedInt P = 0;

    // Mixed tensor/non-tensor operator: also emit the total node count P for this tensor field.
    // NOTE(review): CeedBasisGetNumNodes is called before the CEED_BASIS_NONE ternary below —
    // confirm basis != CEED_BASIS_NONE always holds when is_tensor && !is_all_tensor
    CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
    code << tab << "const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q : P) << ";\n";
  }
  code << tab << "const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
  if (eval_mode != CEED_EVAL_WEIGHT) {
    code << tab << "const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
  }

  // Load basis data
  code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
  switch (eval_mode) {
    case CEED_EVAL_NONE:
      break;
    case CEED_EVAL_INTERP:
      if (is_at_points) {
        // AtPoints
        // Lazily build and cache the device Chebyshev interpolation matrix on the basis data
        if (!basis_data->d_chebyshev_interp_1d) {
          CeedSize    interp_bytes;
          CeedScalar *chebyshev_interp_1d;

          interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
          CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
          CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
          CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
          CeedCallHip(CeedBasisReturnCeed(basis),
                      hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
          CeedCallBackend(CeedFree(&chebyshev_interp_1d));
        }
        if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
        else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
      } else {
        // Standard quadrature
        if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
        else data->B.outputs[i] = basis_data->d_interp_1d;
      }
      if (use_previous_field) {
        // Alias the interpolation matrix already loaded in shared memory for the earlier field
        std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

        code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
      } else {
        code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
        code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
      }
      break;
    case CEED_EVAL_GRAD:
      if (is_at_points) {
        // AtPoints
        // Same lazy Chebyshev interpolation matrix setup as the CEED_EVAL_INTERP case above
        if (!basis_data->d_chebyshev_interp_1d) {
          CeedSize    interp_bytes;
          CeedScalar *chebyshev_interp_1d;

          interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
          CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
          CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
          CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
          CeedCallHip(CeedBasisReturnCeed(basis),
                      hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
          CeedCallBackend(CeedFree(&chebyshev_interp_1d));
        }
        if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
        else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
      } else {
        // Standard quadrature
        if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
        else data->B.outputs[i] = basis_data->d_interp_1d;
      }
      // Tensor gradients also need the interpolation matrix s_B
      if (is_tensor) {
        if (use_previous_field) {
          std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

          code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
        } else {
          code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
          code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
        }
      }
      if (is_at_points) break;  // No G mat for AtPoints
      if (use_3d_slices) {
        // Collocated gradient: s_G is Q_name x Q_name
        if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
        else data->G.outputs[i] = basis_data->d_collo_grad_1d;
        // G matrices are only reused when the earlier field was also CEED_EVAL_GRAD
        if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
          std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

          code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
        } else {
          code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
          code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
        }
      } else {
        bool has_collo_grad = basis_data->d_collo_grad_1d;

        // Prefer the collocated gradient matrix when the basis provides one
        if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
        else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
        if (has_collo_grad) {
          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
            std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

            code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
          } else {
            code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
            code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
          }
        } else {
          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
            std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

            code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
          } else {
            // Non-tensor G carries an extra *dim factor in its size and template parameter
            code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim")
                 << (is_tensor ? "" : var_suffix) << "];\n";
            code << tab << "LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << (is_tensor ? "" : var_suffix) << ">(data, G."
                 << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
          }
        }
      }
      break;
    case CEED_EVAL_WEIGHT:
      break;  // No action
      // LCOV_EXCL_START
    case CEED_EVAL_DIV:
    case CEED_EVAL_CURL:
      break;  // TODO: Not implemented
              // LCOV_EXCL_STOP
  }
  CeedCallBackend(CeedBasisDestroy(&basis));
  return CEED_ERROR_SUCCESS;
}
378 
//------------------------------------------------------------------------------
// Restriction
//------------------------------------------------------------------------------
// Emit the element-restriction section of the generated kernel into `code`:
// for inputs, the read from the L-vector into the element buffer r_e_*; for
// outputs, the write of r_e_* back to the L-vector. Device offset arrays are
// recorded in data->indices. field_input_buffer maps each input to the input
// whose element buffer it shares (field_input_buffer[i] == i means no sharing).
static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i,
                                                      CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
                                                      CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
                                                      bool use_3d_slices) {
  std::string              var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
  std::string              P_name     = (is_all_tensor ? "P_1d" : "P") + var_suffix;
  CeedEvalMode             eval_mode  = CEED_EVAL_NONE;
  CeedInt                  elem_size = 0, num_comp = 0;
  CeedSize                 l_size;
  CeedRestrictionType      rstr_type = CEED_RESTRICTION_STANDARD;
  CeedElemRestriction_Hip *rstr_data;
  CeedElemRestriction      elem_rstr;

  // Get field data
  // NOTE(review): rstr_data is only assigned when elem_rstr != CEED_ELEMRESTRICTION_NONE,
  // but the STANDARD/POINTS cases below dereference it — assumes fields reaching those
  // cases always carry a restriction; confirm against the callers
  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
    CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
    CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
  }
  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));

  // Restriction
  if (is_input) {
    // Input
    if (field_input_buffer[i] != i) {
      std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);

      // Restriction was already done for previous input
      code << tab << "CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
    } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) {
      if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) {
        // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated
        code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
      } else if (rstr_type != CEED_RESTRICTION_POINTS) {
        // Otherwise we're using the scratch space
        code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
      }
      switch (rstr_type) {
        case CEED_RESTRICTION_STANDARD: {
          CeedInt comp_stride;

          // Emit l_size and comp_stride constants, then an indexed L-vector read
          CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
          code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
          data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
          code << tab << "ReadLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
               << P_name << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix
               << ");\n";
          break;
        }
        case CEED_RESTRICTION_STRIDED: {
          bool    has_backend_strides;
          CeedInt num_elem;

          // Default strides are the backend's layout; override with user strides when present
          CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
          CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
          CeedInt strides[3] = {1, elem_size * num_elem, elem_size};

          if (!has_backend_strides) {
            CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
          }
          code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
               << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
          code << tab << "ReadLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
               << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, d" << var_suffix << ", r_e"
               << var_suffix << ");\n";
          break;
        }
        case CEED_RESTRICTION_POINTS: {
          CeedInt comp_stride;

          // AtPoints restriction: only record the comp_stride constant and offsets, no read here
          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
          data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
          break;
        }
        // LCOV_EXCL_START
        case CEED_RESTRICTION_ORIENTED:
        case CEED_RESTRICTION_CURL_ORIENTED:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
  } else {
    // Output
    switch (rstr_type) {
      case CEED_RESTRICTION_STANDARD: {
        CeedInt comp_stride;

        // Emit l_size and comp_stride constants, then an indexed L-vector write
        CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
        code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
        CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
        code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
        data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
        code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
             << P_name << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix
             << ");\n";
        break;
      }
      case CEED_RESTRICTION_STRIDED: {
        bool    has_backend_strides;
        CeedInt num_elem;

        // Default strides are the backend's layout; override with user strides when present
        CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
        CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
        CeedInt strides[3] = {1, elem_size * num_elem, elem_size};

        if (!has_backend_strides) {
          CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
        }
        code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
             << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
        code << tab << "WriteLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
             << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, r_e" << var_suffix << ", d" << var_suffix
             << ");\n";
        break;
      }
      case CEED_RESTRICTION_POINTS:
        // AtPoints restriction: only record the device offsets, no write emitted here
        data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
        break;
      // LCOV_EXCL_START
      case CEED_RESTRICTION_ORIENTED:
      case CEED_RESTRICTION_CURL_ORIENTED:
        break;  // TODO: Not implemented
                // LCOV_EXCL_STOP
    }
  }
  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
  return CEED_ERROR_SUCCESS;
}
515 
516 //------------------------------------------------------------------------------
517 // Basis
518 //------------------------------------------------------------------------------
519 static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i, CeedOperatorField op_field,
520                                                 CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor,
521                                                 bool is_at_points, bool use_3d_slices) {
522   bool      is_tensor = true;
523   CeedBasis basis;
524   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
525   CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
526 
527   std::string         var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
528   std::string         P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
529   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
530   CeedInt             dim = max_dim, elem_size = 0, num_comp = 0, P_1d = 0;
531   CeedElemRestriction elem_rstr;
532 
533   // Get field data
534   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
535   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
536     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
537     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
538   }
539   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
540   if (basis != CEED_BASIS_NONE) {
541     CeedCallBackend(CeedBasisGetDimension(basis, &dim));
542     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
543     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
544   }
545   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
546 
547   // Basis
548   code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
549   if (is_input) {
550     switch (eval_mode) {
551       case CEED_EVAL_NONE:
552         if (!use_3d_slices && !is_at_points) {
553           code << tab << "CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n";
554         }
555         break;
556       case CEED_EVAL_INTERP:
557         if (is_at_points) {
558           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
559 
560           code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
561           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
562                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
563         } else {
564           std::string function_name = is_tensor
565                                           ? ((dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
566                                           : "InterpNonTensor";
567           std::string op_t_1d_name  = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
568 
569           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
570           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
571                << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
572         }
573         break;
574       case CEED_EVAL_GRAD:
575         if (is_at_points) {
576           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
577 
578           code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
579           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
580                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
581         } else if (use_3d_slices) {
582           std::string function_name = (dim > 1 ? "InterpTensor" : "Interp") + std::to_string(dim) + "d";
583 
584           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
585           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
586                << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
587         } else if (is_tensor) {
588           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
589           std::string function_name = (dim == 1 ? "Grad" : (is_collocated ? "GradTensorCollocated" : "GradTensor")) + std::to_string(dim) + "d" +
590                                       (is_all_tensor ? "" : "Flattened");
591           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
592 
593           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
594                << (is_all_tensor && dim >= 3 ? Q_name : "1") << "];\n";
595           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
596                << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
597         } else {
598           std::string function_name = "GradNonTensor";
599 
600           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
601           code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
602                << ", OP_T_1D>(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
603         }
604         break;
605       case CEED_EVAL_WEIGHT: {
606         if (is_at_points) {
607           code << tab << "// Nothing to do AtPoints\n";
608         } else {
609           CeedBasis_Hip_shared *basis_data;
610           std::string           function_name = is_tensor
611                                                     ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
612                                                     : "WeightNonTensor";
613 
614           code << tab << "CeedScalar r_q" << var_suffix << "[" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
615           CeedCallBackend(CeedBasisGetData(basis, &basis_data));
616           data->W = basis_data->d_q_weight_1d;
617           code << tab << function_name << "<" << P_name << ", " << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
618         }
619         break;
620       }
621       // LCOV_EXCL_START
622       case CEED_EVAL_DIV:
623       case CEED_EVAL_CURL:
624         break;  // TODO: Not implemented
625                 // LCOV_EXCL_STOP
626     }
627   } else {
628     switch (eval_mode) {
629       case CEED_EVAL_NONE:
630         code << tab << "CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
631         break;  // No action
632       case CEED_EVAL_INTERP:
633         code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
634         if (is_at_points) {
635           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
636 
637           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
638                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
639         } else {
640           std::string function_name =
641               is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
642                         : "InterpTransposeNonTensor";
643           std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
644 
645           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
646                << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
647         }
648         break;
649       case CEED_EVAL_GRAD:
650         code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
651         if (is_at_points) {
652           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
653 
654           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
655                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
656         } else if (use_3d_slices) {
657           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
658 
659           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
660                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
661         } else if (is_tensor) {
662           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
663           std::string function_name = (dim == 1 ? "GradTranspose" : (is_collocated ? "GradTransposeTensorCollocated" : "GradTransposeTensor")) +
664                                       std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
665           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
666 
667           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
668                << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
669         } else {
670           std::string function_name = "GradTransposeNonTensor";
671 
672           code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
673                << ", OP_T_1D>(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
674         }
675         break;
676       // LCOV_EXCL_START
677       case CEED_EVAL_WEIGHT:
678         break;  // Should not occur
679       case CEED_EVAL_DIV:
680       case CEED_EVAL_CURL:
681         break;  // TODO: Not implemented
682                 // LCOV_EXCL_STOP
683     }
684   }
685   CeedCallBackend(CeedBasisDestroy(&basis));
686   return CEED_ERROR_SUCCESS;
687 }
688 
689 //------------------------------------------------------------------------------
690 // QFunction
691 //------------------------------------------------------------------------------
//------------------------------------------------------------------------------
// Emit the QFunction-application section of the generated operator kernel.
//
// Appends to `code`:
//   1. Register setup for each output field ("r_q_out_*" arrays, or "r_c_out_*"
//      coefficient accumulators for AtPoints operators)
//   2. One of three quadrature-point processing strategies:
//        - is_at_points:  loop over batches of points, reading coordinates and
//                         evaluating bases at arbitrary points
//        - use_3d_slices: loop over planes of 3D tensor elements, one
//                         quadrature slice at a time, to save registers
//        - otherwise:     a single scope operating on full elements
//   3. The inputs[]/outputs[] pointer arrays and the user QFunction call
//   4. For the AtPoints and 3D-slices paths, the transpose action that folds
//      QFunction outputs back into the per-element registers
//
// Side effect: stores device offset pointers from restrictions into `data`
// (data->indices.inputs[i]) so the kernel launch can pass them.
//
// @param[in,out] code              Stream receiving generated kernel source
// @param[in,out] data              Backend operator data (receives d_offsets)
// @param[in,out] tab               Indentation helper for generated code
// @param[in]     max_dim           Maximum basis dimension over all fields
// @param[in]     max_num_points    Max points per element (AtPoints only)
// @param[in]     num_input_fields  Number of operator input fields
// @param[in]     op_input_fields   Operator input fields
// @param[in]     qf_input_fields   QFunction input fields
// @param[in]     num_output_fields Number of operator output fields
// @param[in]     op_output_fields  Operator output fields
// @param[in]     qf_output_fields  QFunction output fields
// @param[in]     qfunction_name    Name of the user QFunction to invoke
// @param[in]     Q_1d              Number of 1D quadrature points
// @param[in]     is_all_tensor     True if all bases are tensor-product
// @param[in]     is_at_points      True for AtPoints operators
// @param[in]     use_3d_slices     True when 3D elements are processed as
//                                  planes of quadrature points
//
// @return CEED_ERROR_SUCCESS on success
//------------------------------------------------------------------------------
static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt max_dim,
                                                    CeedInt max_num_points, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
                                                    CeedQFunctionField *qf_input_fields, CeedInt num_output_fields,
                                                    CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields,
                                                    std::string qfunction_name, CeedInt Q_1d, bool is_all_tensor, bool is_at_points,
                                                    bool use_3d_slices) {
  // Generated-code symbol for the number of quadrature points per element
  std::string         Q_name    = is_all_tensor ? "Q_1d" : "Q";
  CeedEvalMode        eval_mode = CEED_EVAL_NONE;
  CeedElemRestriction elem_rstr;

  // Setup output arrays
  // Sizes follow the basis-action register layout: 3D all-tensor kernels hold
  // Q_1d values per component per thread; lower dimensions hold one.
  code << "\n";
  code << tab << "// -- Output field setup\n";
  for (CeedInt i = 0; i < num_output_fields; i++) {
    const char *field_name;
    std::string var_suffix = "_out_" + std::to_string(i);

    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
    code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
    switch (eval_mode) {
      case CEED_EVAL_NONE:
        if (is_at_points) {
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n";
        } else {
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
               << "];\n";
        }
        break;
      case CEED_EVAL_INTERP:
        if (is_at_points) {
          // Accumulator for point data; zeroed so point batches can accumulate
          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
          code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
               << "[i] = 0.0;\n";
        } else {
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
               << "];\n";
        }
        break;
      case CEED_EVAL_GRAD:
        if (is_at_points) {
          // Accumulator for point data; zeroed so point batches can accumulate
          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
          code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
               << "[i] = 0.0;\n";
        } else if (use_3d_slices) {
          // Accumulator for gradient slices; each q-plane adds its contribution
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
          code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) r_q" << var_suffix << "[i] = 0.0;\n";
        } else {
          // Full gradient storage: one entry per component per dimension
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
               << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") << "];\n";
        }
        break;
      case CEED_EVAL_WEIGHT:
        break;  // Weights are never outputs; no registers needed
        // LCOV_EXCL_START
      case CEED_EVAL_DIV:
      case CEED_EVAL_CURL:
        break;  // TODO: Not implemented
                // LCOV_EXCL_STOP
    }
  }

  if (is_at_points) {
    // We need to handle batches of points
    // The loop bound is rounded up to a multiple of the thread block size so
    // every thread participates in each batch (needed for basis collectives);
    // threads with i >= num points contribute zeros on the transpose path.
    code << "\n";
    code << tab << "// Note: Using batches of points\n";
    code << tab << "const CeedInt point_loop_bound = (blockDim.x*blockDim.y) * ceil((1.0*max_num_points) / (blockDim.x*blockDim.y));\n\n";
    code << tab << "#pragma unroll\n";
    code << tab << "for (CeedInt i = threadIdx.x + threadIdx.y*blockDim.x; i < point_loop_bound; i += blockDim.x*blockDim.y) {\n";
    tab.push();
    code << tab << "const CeedInt p = i % max_num_points;\n\n";

    // Point coordinates are needed to evaluate the basis at arbitrary points
    code << tab << "// -- Coordinates\n";
    code << tab << "CeedScalar r_x[max_dim];\n";
    code << tab << "ReadPoint<max_dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n";

    // Gather each input field's values at the current point into r_s_*
    code << tab << "// -- Input fields\n";
    for (CeedInt i = 0; i < num_input_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_in_" + std::to_string(i);
      std::string P_name     = "P_1d" + var_suffix;

      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
      code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
      // Basis action
      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          // No basis action; read L-vector values directly at the point
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          code << tab << "ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
               << ", max_num_points>(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
          break;
        case CEED_EVAL_INTERP:
          // Evaluate the interpolation from coefficients r_c_* at coordinates r_x
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          code << tab << "InterpAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
               << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
          break;
        case CEED_EVAL_GRAD:
          // Evaluate the gradient from coefficients r_c_* at coordinates r_x
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          code << tab << "GradAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
               << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
          break;
        case CEED_EVAL_WEIGHT:
          // No quadrature weights for arbitrary points; use unit weight
          code << tab << "CeedScalar r_s" << var_suffix << "[1];\n";
          code << tab << "r_s" << var_suffix << "[0] = 1.0;\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
    code << "\n";
    // Per-point output registers; filled by the QFunction, folded back below
    code << tab << "// -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_out_" + std::to_string(i);

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      // Basis action
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          break;
        case CEED_EVAL_INTERP:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          break;
        case CEED_EVAL_GRAD:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_WEIGHT:
          break;  // Should not occur
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }

  } else if (use_3d_slices) {
    // We treat quadrature points per slice in 3d to save registers
    // Inputs are re-sliced from r_q_* (or read from global memory for
    // CEED_EVAL_NONE) into per-slice r_s_* registers each iteration.
    code << "\n";
    code << tab << "// Note: Using planes of 3D elements\n";
    code << tab << "#pragma unroll\n";
    code << tab << "for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
    tab.push();
    code << tab << "// -- Input fields\n";
    for (CeedInt i = 0; i < num_input_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_in_" + std::to_string(i);

      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
      code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
      // Basis action
      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          bool is_strided;

          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";

          // No basis action: read this slice of the E-vector directly, using
          // either compile-time strides or the restriction's offset indices
          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
          CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
          if (is_strided) {
            bool    has_backend_strides;
            CeedInt num_elem, elem_size;

            CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
            CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
            CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
            // Default backend strides; overwritten below if user-specified
            CeedInt strides[3] = {1, elem_size * num_elem, elem_size};

            if (!has_backend_strides) {
              CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
            }
            // Strides are baked into the generated source as constants
            code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
                 << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
            code << tab << "ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", strides" << var_suffix << "_0, strides"
                 << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n";
          } else {
            CeedSize                 l_size = 0;
            CeedInt                  comp_stride;
            CeedElemRestriction_Hip *rstr_data;

            CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
            code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
            CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
            code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
            CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
            // Record the device offsets pointer for the kernel launch arguments
            data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
            code << tab << "ReadEVecSliceStandard3d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", " << Q_name << ">(data, l_size"
                 << var_suffix << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
          }
          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
          break;
        case CEED_EVAL_INTERP:
          // Copy the q-th slice out of the interpolated values r_q_*
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n";
          tab.push();
          code << tab << "r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n";
          tab.pop();
          code << tab << "}\n";
          break;
        case CEED_EVAL_GRAD:
          // Compute the gradient for this slice from the collocated values
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          code << tab << "GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_q" << var_suffix << ", s_G"
               << var_suffix << ", r_s" << var_suffix << ");\n";
          break;
        case CEED_EVAL_WEIGHT:
          // Quadrature weight for this slice, precomputed in r_q_*
          code << tab << "CeedScalar r_s" << var_suffix << "[1];\n";
          code << tab << "r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
    code << "\n";
    // Per-slice output registers; filled by the QFunction, folded back below
    code << tab << "// -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_out_" + std::to_string(i);

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      // Basis action
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          break;
        case CEED_EVAL_INTERP:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          break;
        case CEED_EVAL_GRAD:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_WEIGHT:
          break;  // Should not occur
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
  } else {
    // Full-element path: the QFunction operates directly on the basis-action
    // registers, so r_s_* are just aliases of r_q_* (no copies)
    code << "\n";
    code << tab << "// Note: Using full elements\n";
    code << tab << "{\n";
    tab.push();
    code << tab << "// -- Input fields\n";
    for (CeedInt i = 0; i < num_input_fields; i++) {
      const char *field_name;

      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
      code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
      code << tab << "CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n";
    }
    code << tab << "// -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      code << tab << "CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n";
    }
  }

  // Input and output buffers
  // Arrays are sized at least 1 so the declarations stay valid when a side
  // has zero fields
  code << "\n";
  code << tab << "// -- QFunction inputs and outputs\n";
  code << tab << "// ---- Inputs\n";
  code << tab << "CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
  for (CeedInt i = 0; i < num_input_fields; i++) {
    const char *field_name;

    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
    code << tab << "// ------ Input field " << i << ": " << field_name << "\n";
    code << tab << "inputs[" << i << "] = r_s_in_" << i << ";\n";
  }
  code << tab << "// ---- Outputs\n";
  code << tab << "CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
  for (CeedInt i = 0; i < num_output_fields; i++) {
    const char *field_name;

    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
    code << tab << "// ------ Output field " << i << ": " << field_name << "\n";
    code << tab << "outputs[" << i << "] = r_s_out_" << i << ";\n";
  }

  // Apply QFunction
  // Only the 3D all-tensor full-element path processes Q_1d points per call;
  // every other path invokes the QFunction for a single point/slice at a time
  code << "\n";
  code << tab << "// -- Apply QFunction\n";
  code << tab << "" << qfunction_name << "(ctx, ";
  if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
    code << "1";
  } else {
    code << Q_name;
  }
  code << ", inputs, outputs);\n";

  if (is_at_points) {
    // Map back to coefficients
    code << "\n";
    code << tab << "// -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_out_" + std::to_string(i);
      std::string P_name     = "P_1d" + var_suffix;

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      // Basis action
      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
      switch (eval_mode) {
        case CEED_EVAL_NONE: {
          CeedInt             comp_stride;
          // NOTE: intentionally shadows the function-scope elem_rstr
          CeedElemRestriction elem_rstr;

          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
          code << tab << "WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
               << ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]"
               << ", r_s" << var_suffix << ", d" << var_suffix << ");\n";
          break;
        }
        case CEED_EVAL_INTERP:
          // Zero padding-thread values so they don't pollute the accumulation
          code << tab << "if (i >= points.num_per_elem[elem]) {\n";
          tab.push();
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
          tab.pop();
          code << tab << "}\n";
          code << tab << "InterpTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
               << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
          break;
        case CEED_EVAL_GRAD:
          // Zero padding-thread values so they don't pollute the accumulation
          code << tab << "if (i >= points.num_per_elem[elem]) {\n";
          tab.push();
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
          tab.pop();
          code << tab << "}\n";
          code << tab << "GradTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
               << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_WEIGHT:
          break;  // Should not occur
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
  } else if (use_3d_slices) {
    // Copy or apply transpose grad, if needed
    // Slice results in r_s_* are folded back into the full r_q_* registers
    code << "\n";
    code << tab << "// -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_out_" + std::to_string(i);
      std::string P_name     = "P_1d" + var_suffix;

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      // Basis action
      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
          tab.push();
          code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
          tab.pop();
          code << tab << "}\n";
          break;
        case CEED_EVAL_INTERP:
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
          tab.push();
          code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
          tab.pop();
          code << tab << "}\n";
          break;
        case CEED_EVAL_GRAD:
          // Transpose of the collocated slice gradient accumulates into r_q_*
          code << tab << "GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_s" << var_suffix << ", s_G"
               << var_suffix << ", r_q" << var_suffix << ");\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_WEIGHT:
          break;  // Should not occur
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
  }
  // Close the point batch loop (AtPoints), slice loop (3D slices), or plain
  // scope (full elements) opened above
  tab.pop();
  code << tab << "}\n";
  return CEED_ERROR_SUCCESS;
}
1107 
1108 //------------------------------------------------------------------------------
1109 // Build single operator kernel
1110 //------------------------------------------------------------------------------
1111 extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_build) {
1112   bool                   is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
1113   Ceed                   ceed;
1114   CeedInt                Q = 0, Q_1d = 0, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
1115   CeedQFunctionField    *qf_input_fields, *qf_output_fields;
1116   CeedQFunction_Hip_gen *qf_data;
1117   CeedQFunction          qf;
1118   CeedOperatorField     *op_input_fields, *op_output_fields;
1119   CeedOperator_Hip_gen  *data;
1120   std::ostringstream     code;
1121   Tab                    tab;
1122 
1123   CeedCallBackend(CeedOperatorGetData(op, &data));
1124   {
1125     bool is_setup_done;
1126 
1127     CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
1128     if (is_setup_done) {
1129       *is_good_build = !data->use_fallback;
1130       return CEED_ERROR_SUCCESS;
1131     }
1132   }
1133 
1134   // Check field compatibility
1135   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
1136   {
1137     bool has_shared_bases = true;
1138 
1139     for (CeedInt i = 0; i < num_input_fields; i++) {
1140       CeedBasis basis;
1141 
1142       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
1143       if (basis != CEED_BASIS_NONE) {
1144         bool        is_tensor = true;
1145         const char *resource;
1146         char       *resource_root;
1147         Ceed        basis_ceed;
1148 
1149         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
1150         is_all_tensor    = is_all_tensor && is_tensor;
1151         is_all_nontensor = is_all_nontensor && !is_tensor;
1152         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
1153         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
1154         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
1155         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
1156         CeedCallBackend(CeedFree(&resource_root));
1157         CeedCallBackend(CeedDestroy(&basis_ceed));
1158       }
1159       CeedCallBackend(CeedBasisDestroy(&basis));
1160     }
1161 
1162     for (CeedInt i = 0; i < num_output_fields; i++) {
1163       CeedBasis basis;
1164 
1165       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
1166       if (basis != CEED_BASIS_NONE) {
1167         bool        is_tensor = true;
1168         const char *resource;
1169         char       *resource_root;
1170         Ceed        basis_ceed;
1171 
1172         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
1173         is_all_tensor    = is_all_tensor && is_tensor;
1174         is_all_nontensor = is_all_nontensor && !is_tensor;
1175 
1176         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
1177         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
1178         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
1179         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
1180         CeedCallBackend(CeedFree(&resource_root));
1181         CeedCallBackend(CeedDestroy(&basis_ceed));
1182       }
1183       CeedCallBackend(CeedBasisDestroy(&basis));
1184     }
1185     // -- Fallback to ref if not all bases are shared
1186     if (!has_shared_bases) {
1187       *is_good_build = false;
1188       return CEED_ERROR_SUCCESS;
1189     }
1190   }
1191   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1192   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
1193   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
1194   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
1195 
1196   // Get operator data
1197   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
1198   {
1199     CeedInt max_P = 0, max_P_1d = 0;
1200 
1201     CeedCallBackend(CeedOperatorBuildKernelData_Hip_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields,
1202                                                         qf_output_fields, &max_P, &max_P_1d, &Q, &Q_1d, &max_dim, &is_all_tensor, &use_3d_slices));
1203     data->max_P_1d = is_all_tensor ? max_P_1d : max_P;
1204   }
1205   if (max_dim == 0) max_dim = 1;
1206   data->dim = max_dim;
1207   if (is_at_points) {
1208     CeedElemRestriction_Hip *rstr_data;
1209     CeedElemRestriction      rstr_points = NULL;
1210 
1211     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
1212     CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
1213     CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
1214     CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data));
1215     data->points.indices = (CeedInt *)rstr_data->d_offsets;
1216     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
1217   }
1218   if (is_at_points) use_3d_slices = false;
1219   if (Q_1d == 0) {
1220     if (is_at_points) Q_1d = max_num_points;
1221     else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d));
1222   }
1223   if (Q == 0) Q = Q_1d;
1224   data->Q    = Q;
1225   data->Q_1d = Q_1d;
1226 
1227   // Check for restriction only identity operator
1228   {
1229     bool is_identity_qf;
1230 
1231     CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf));
1232     if (is_identity_qf) {
1233       CeedEvalMode eval_mode_in, eval_mode_out;
1234 
1235       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in));
1236       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out));
1237       CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND,
1238                 "Backend does not implement restriction only identity operators");
1239     }
1240   }
1241 
1242   // Load basis source files
1243   if (!is_all_nontensor) {
1244     code << tab << "// Tensor basis source\n";
1245     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
1246   }
1247   if (!is_all_tensor) {
1248     code << tab << "// Non-tensor basis source\n";
1249     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n";
1250   }
1251   if (is_at_points) {
1252     code << tab << "// AtPoints basis source\n";
1253     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n";
1254   }
1255   if (!is_all_tensor && !is_all_nontensor) {
1256     code << tab << "// Tensor basis source\n";
1257     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h>\n\n";
1258   }
1259   code << tab << "// CodeGen operator source\n";
1260   code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
1261 
1262   // Get QFunction name
1263   std::string qfunction_name(qf_data->qfunction_name);
1264   std::string operator_name;
1265 
1266   operator_name = "CeedKernelHipGenOperator_" + qfunction_name;
1267 
1268   // Define CEED_Q_VLA
1269   code << "\n" << tab << "#undef CEED_Q_VLA\n";
1270   if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
1271     code << tab << "#define CEED_Q_VLA 1\n\n";
1272   } else {
1273     code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n";
1274   }
1275 
1276   // Add user QFunction source
1277   {
1278     const char *source_path;
1279 
1280     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
1281     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
1282 
1283     code << tab << "// User QFunction source\n";
1284     code << tab << "#include \"" << source_path << "\"\n\n";
1285   }
1286 
1287   // Setup
1288   code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
1289   code << tab << "// Operator Kernel\n";
1290   code << tab << "// \n";
1291   code << tab << "// d_[in,out]_i:   CeedVector device array\n";
1292   code << tab << "// r_[in,out]_e_i: Element vector register\n";
1293   code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
1294   code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
1295   code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
1296   code << tab << "// \n";
1297   code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
1298   code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
1299   code << tab << "// -----------------------------------------------------------------------------\n";
1300   code << tab << "extern \"C\" __launch_bounds__(BLOCK_SIZE)\n";
1301   code << "__global__ void " << operator_name
1302        << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar* W, Points_Hip points) {\n";
1303   tab.push();
1304 
1305   // Scratch buffers
1306   for (CeedInt i = 0; i < num_input_fields; i++) {
1307     CeedEvalMode eval_mode;
1308 
1309     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1310     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
1311       code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
1312     }
1313   }
1314   for (CeedInt i = 0; i < num_output_fields; i++) {
1315     code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
1316   }
1317 
1318   code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
1319   if (!is_all_tensor) {
1320     code << tab << "const CeedInt Q = " << Q << ";\n";
1321   }
1322   if (!is_all_nontensor) {
1323     code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
1324   }
1325   if (is_at_points) {
1326     code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n";
1327     code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
1328   }
1329 
1330   // Shared data
1331   code << tab << "extern __shared__ CeedScalar slice[];\n";
1332   code << tab << "SharedData_Hip data;\n";
1333   code << tab << "data.t_id_x = threadIdx.x;\n";
1334   code << tab << "data.t_id_y = threadIdx.y;\n";
1335   code << tab << "data.t_id_z = threadIdx.z;\n";
1336   code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
1337   code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
1338 
1339   // -- Determine input mat reuse
1340   FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
1341 
1342   for (CeedInt i = 0; i < num_input_fields; i++) {
1343     input_matrix_reuse[i].index = -1;
1344   }
1345   for (CeedInt i = 0; i < num_input_fields; i++) {
1346     bool         is_tensor = true;
1347     CeedEvalMode eval_mode_i;
1348     CeedBasis    basis_i;
1349 
1350     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
1351     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
1352     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
1353     CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
1354     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
1355       CeedEvalMode eval_mode_j;
1356       CeedBasis    basis_j;
1357 
1358       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1359       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1360       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1361       if (basis_i == basis_j) {
1362         if (is_tensor) {
1363           input_matrix_reuse[i].index     = j;
1364           input_matrix_reuse[i].is_input  = true;
1365           input_matrix_reuse[i].eval_mode = eval_mode_j;
1366         } else {
1367           // For non-tensor can only re-use with the same eval mode
1368           if (eval_mode_i == eval_mode_j) {
1369             input_matrix_reuse[i].index     = j;
1370             input_matrix_reuse[i].is_input  = true;
1371             input_matrix_reuse[i].eval_mode = eval_mode_j;
1372           }
1373         }
1374       }
1375       CeedCallBackend(CeedBasisDestroy(&basis_j));
1376     }
1377     CeedCallBackend(CeedBasisDestroy(&basis_i));
1378   }
1379 
1380   // -- Determine output mat reuse
1381   FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
1382 
1383   for (CeedInt i = 0; i < num_output_fields; i++) {
1384     output_matrix_reuse[i].index = -1;
1385   }
1386   for (CeedInt i = 0; i < num_output_fields; i++) {
1387     bool         is_tensor = true;
1388     CeedEvalMode eval_mode_i;
1389     CeedBasis    basis_i;
1390 
1391     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
1392     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
1393     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
1394       CeedEvalMode eval_mode_j;
1395       CeedBasis    basis_j;
1396 
1397       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1398       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1399       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1400       if (basis_i == basis_j) {
1401         if (is_tensor) {
1402           output_matrix_reuse[i].index     = j;
1403           output_matrix_reuse[i].is_input  = true;
1404           output_matrix_reuse[i].eval_mode = eval_mode_j;
1405         } else {
1406           // For non-tensor can only re-use with the same eval mode
1407           if (eval_mode_i == eval_mode_j) {
1408             output_matrix_reuse[i].index     = j;
1409             output_matrix_reuse[i].is_input  = true;
1410             output_matrix_reuse[i].eval_mode = eval_mode_j;
1411           }
1412         }
1413       }
1414       CeedCallBackend(CeedBasisDestroy(&basis_j));
1415     }
1416     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
1417       CeedEvalMode eval_mode_j;
1418       CeedBasis    basis_j;
1419 
1420       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
1421       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1422       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
1423       CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
1424       if (basis_i == basis_j) {
1425         if (is_tensor) {
1426           output_matrix_reuse[i].index     = j;
1427           output_matrix_reuse[i].is_input  = false;
1428           output_matrix_reuse[i].eval_mode = eval_mode_j;
1429         } else {
1430           // For non-tensor can only re-use with the same eval mode
1431           if (eval_mode_i == eval_mode_j) {
1432             output_matrix_reuse[i].index     = j;
1433             output_matrix_reuse[i].is_input  = false;
1434             output_matrix_reuse[i].eval_mode = eval_mode_j;
1435           }
1436         }
1437       }
1438       CeedCallBackend(CeedBasisDestroy(&basis_j));
1439     }
1440     CeedCallBackend(CeedBasisDestroy(&basis_i));
1441   }
1442 
1443   // Initialize constants, and matrices B and G
1444   code << "\n" << tab << "// Input field constants and basis data\n";
1445   for (CeedInt i = 0; i < num_input_fields; i++) {
1446     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
1447                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1448   }
1449   code << "\n" << tab << "// Output field constants and basis data\n";
1450   for (CeedInt i = 0; i < num_output_fields; i++) {
1451     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
1452                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
1453   }
1454 
1455   // Loop over all elements
1456   code << "\n" << tab << "// Element loop\n";
1457   code << tab << "__syncthreads();\n";
1458   code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
1459   tab.push();
1460 
1461   // -- Compute minimum buffer space needed
1462   CeedInt max_rstr_buffer_size = 1;
1463 
1464   for (CeedInt i = 0; i < num_input_fields; i++) {
1465     CeedEvalMode eval_mode;
1466 
1467     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1468     if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
1469       CeedInt             num_comp;
1470       CeedElemRestriction elem_rstr;
1471 
1472       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
1473       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1474       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1475       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1476     }
1477   }
1478   for (CeedInt i = 0; i < num_output_fields; i++) {
1479     CeedEvalMode eval_mode;
1480 
1481     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
1482     if (eval_mode != CEED_EVAL_NONE) {
1483       CeedInt             num_comp;
1484       CeedElemRestriction elem_rstr;
1485 
1486       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
1487       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1488       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1489       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1490     }
1491   }
1492   code << tab << "// Scratch restriction buffer space\n";
1493   code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
1494 
1495   // -- Determine best input field processing order
1496   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
1497 
1498   for (CeedInt i = 0; i < num_input_fields; i++) {
1499     field_rstr_in_buffer[i] = -1;
1500     input_field_order[i]    = -1;
1501   }
1502   {
1503     bool    is_ordered[CEED_FIELD_MAX];
1504     CeedInt curr_index = 0;
1505 
1506     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
1507     for (CeedInt i = 0; i < num_input_fields; i++) {
1508       CeedVector          vec_i;
1509       CeedElemRestriction rstr_i;
1510 
1511       if (is_ordered[i]) continue;
1512       field_rstr_in_buffer[i]       = i;
1513       is_ordered[i]                 = true;
1514       input_field_order[curr_index] = i;
1515       curr_index++;
1516       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
1517       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
1518       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
1519       for (CeedInt j = i + 1; j < num_input_fields; j++) {
1520         CeedVector          vec_j;
1521         CeedElemRestriction rstr_j;
1522 
1523         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
1524         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
1525         if (rstr_i == rstr_j && vec_i == vec_j) {
1526           field_rstr_in_buffer[j]       = i;
1527           is_ordered[j]                 = true;
1528           input_field_order[curr_index] = j;
1529           curr_index++;
1530         }
1531         CeedCallBackend(CeedVectorDestroy(&vec_j));
1532         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
1533       }
1534       CeedCallBackend(CeedVectorDestroy(&vec_i));
1535       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
1536     }
1537   }
1538 
1539   // -- Input restriction and basis
1540   code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
1541   for (CeedInt i = 0; i < num_input_fields; i++) {
1542     const char   *field_name;
1543     const CeedInt f = input_field_order[i];
1544 
1545     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
1546     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
1547 
1548     // ---- Restriction
1549     CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
1550                                                                max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1551 
1552     // ---- Basis action
1553     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
1554                                                          is_all_tensor, is_at_points, use_3d_slices));
1555   }
1556 
1557   // -- Q function
1558   CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
1559                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
1560                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices));
1561 
1562   // -- Output basis and restriction
1563   code << "\n" << tab << "// -- Output field basis action and restrictions\n";
1564   for (CeedInt i = 0; i < num_output_fields; i++) {
1565     const char *field_name;
1566 
1567     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
1568     code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
1569 
1570     // ---- Basis action
1571     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
1572                                                          is_all_tensor, is_at_points, use_3d_slices));
1573 
1574     // ---- Restriction
1575     CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, i, NULL, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d,
1576                                                                false, is_all_tensor, is_at_points, use_3d_slices));
1577   }
1578 
1579   // Close loop and function
1580   tab.pop();
1581   code << tab << "}\n";
1582   tab.pop();
1583   code << tab << "}\n";
1584   code << tab << "// -----------------------------------------------------------------------------\n\n";
1585 
1586   CeedInt block_sizes[3] = {0, 0, 0};
1587   CeedInt num_elem;
1588 
1589   // Compile
1590   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
1591   CeedCallBackend(BlockGridCalculate_Hip_gen(is_all_tensor ? max_dim : 1, num_elem, data->max_P_1d, is_all_tensor ? Q_1d : Q, block_sizes));
1592   if (is_at_points) block_sizes[2] = 1;
1593   {
1594     bool is_compile_good = false;
1595 
1596     data->thread_1d = block_sizes[0];
1597     CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "OP_T_1D", block_sizes[0], "BLOCK_SIZE",
1598                                        block_sizes[0] * block_sizes[1] * block_sizes[2]));
1599     if (is_compile_good) {
1600       *is_good_build = true;
1601       CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, operator_name.c_str(), &data->op));
1602     } else {
1603       *is_good_build     = false;
1604       data->use_fallback = true;
1605     }
1606   }
1607   CeedCallBackend(CeedOperatorSetSetupDone(op));
1608   CeedCallBackend(CeedDestroy(&ceed));
1609   CeedCallBackend(CeedQFunctionDestroy(&qf));
1610   return CEED_ERROR_SUCCESS;
1611 }
1612 
1613 //------------------------------------------------------------------------------
1614 // Build AtPoints assembly operator kernel
1615 //------------------------------------------------------------------------------
1616 static int CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op, bool is_full, bool *is_good_build) {
1617   bool                   is_all_tensor = true, is_at_points = false, use_3d_slices = false;
1618   Ceed                   ceed;
1619   CeedInt                Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
1620   CeedQFunctionField    *qf_input_fields, *qf_output_fields;
1621   CeedQFunction_Hip_gen *qf_data;
1622   CeedQFunction          qf;
1623   CeedOperatorField     *op_input_fields, *op_output_fields;
1624   CeedOperator_Hip_gen  *data;
1625   std::ostringstream     code;
1626   Tab                    tab;
1627 
1628   // Check compatibility
1629   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1630   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
1631   CeedCheck(is_at_points, ceed, CEED_ERROR_BACKEND, "Only AtPoints operator assembly supported");
1632 
1633   // Retrieve operator data
1634   CeedCallBackend(CeedOperatorGetData(op, &data));
1635   Q       = data->Q;
1636   Q_1d    = data->Q_1d;
1637   max_dim = data->dim;
1638   {
1639     CeedElemRestriction rstr_points = NULL;
1640 
1641     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
1642     CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
1643     CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
1644     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
1645   }
1646   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
1647   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
1648   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
1649   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
1650 
1651   // Load basis source files
1652   code << tab << "// Tensor basis source\n";
1653   code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
1654   code << tab << "// AtPoints basis source\n";
1655   code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n";
1656   code << tab << "// CodeGen operator source\n";
1657   code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
1658 
1659   // Get QFunction name
1660   std::string qfunction_name(qf_data->qfunction_name);
1661   std::string operator_name;
1662 
1663   if (is_full) {
1664     operator_name = "CeedKernelHipGenOperatorFullAssembly_" + qfunction_name;
1665   } else {
1666     operator_name = "CeedKernelHipGenOperatorDiagonalAssembly_" + qfunction_name;
1667   }
1668 
1669   // Define CEED_Q_VLA
1670   code << "\n" << tab << "#undef CEED_Q_VLA\n";
1671   code << tab << "#define CEED_Q_VLA 1\n\n";
1672 
1673   // Add user QFunction source
1674   {
1675     const char *source_path;
1676 
1677     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
1678     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
1679 
1680     code << tab << "// User QFunction source\n";
1681     code << tab << "#include \"" << source_path << "\"\n\n";
1682   }
1683 
1684   // Setup
1685   code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
1686   code << tab << "// Operator Assembly Kernel\n";
1687   code << tab << "// \n";
1688   code << tab << "// d_[in,out]_i:   CeedVector device array\n";
1689   code << tab << "// r_[in,out]_e_i: Element vector register\n";
1690   code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
1691   code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
1692   code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
1693   code << tab << "// \n";
1694   code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
1695   code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
1696   code << tab << "// -----------------------------------------------------------------------------\n";
1697   code << tab << "extern \"C\" __global__ void " << operator_name
1698        << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar *W, Points_Hip "
1699           "points, CeedScalar *__restrict__ values_array) {\n";
1700   tab.push();
1701 
1702   // Scratch buffers
1703   for (CeedInt i = 0; i < num_input_fields; i++) {
1704     CeedEvalMode eval_mode;
1705 
1706     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1707     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
1708       code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
1709     }
1710   }
1711   for (CeedInt i = 0; i < num_output_fields; i++) {
1712     code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
1713   }
1714 
1715   code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
1716   code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
1717   code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n";
1718   code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
1719 
1720   // Shared data
1721   code << tab << "extern __shared__ CeedScalar slice[];\n";
1722   code << tab << "SharedData_Hip data;\n";
1723   code << tab << "data.t_id_x = threadIdx.x;\n";
1724   code << tab << "data.t_id_y = threadIdx.y;\n";
1725   code << tab << "data.t_id_z = threadIdx.z;\n";
1726   code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
1727   code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
1728 
1729   // -- Determine input mat reuse
1730   FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
1731 
1732   for (CeedInt i = 0; i < num_input_fields; i++) {
1733     input_matrix_reuse[i].index = -1;
1734   }
1735   for (CeedInt i = 0; i < num_input_fields; i++) {
1736     CeedEvalMode eval_mode_i;
1737     CeedBasis    basis_i;
1738 
1739     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
1740     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
1741     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
1742     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
1743       CeedEvalMode eval_mode_j;
1744       CeedBasis    basis_j;
1745 
1746       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1747       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1748       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1749       if (basis_i == basis_j) {
1750         input_matrix_reuse[i].index     = j;
1751         input_matrix_reuse[i].is_input  = true;
1752         input_matrix_reuse[i].eval_mode = eval_mode_j;
1753       }
1754       CeedCallBackend(CeedBasisDestroy(&basis_j));
1755     }
1756     CeedCallBackend(CeedBasisDestroy(&basis_i));
1757   }
1758 
1759   // -- Determine output mat reuse
1760   FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
1761 
1762   for (CeedInt i = 0; i < num_output_fields; i++) {
1763     output_matrix_reuse[i].index = -1;
1764   }
1765   for (CeedInt i = 0; i < num_output_fields; i++) {
1766     CeedEvalMode eval_mode_i;
1767     CeedBasis    basis_i;
1768 
1769     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
1770     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
1771     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
1772       CeedEvalMode eval_mode_j;
1773       CeedBasis    basis_j;
1774 
1775       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1776       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1777       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1778       if (basis_i == basis_j) {
1779         output_matrix_reuse[i].index     = j;
1780         output_matrix_reuse[i].is_input  = true;
1781         output_matrix_reuse[i].eval_mode = eval_mode_j;
1782       }
1783       CeedCallBackend(CeedBasisDestroy(&basis_j));
1784     }
1785     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
1786       CeedEvalMode eval_mode_j;
1787       CeedBasis    basis_j;
1788 
1789       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
1790       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1791       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
1792       if (basis_i == basis_j) {
1793         output_matrix_reuse[i].index     = j;
1794         output_matrix_reuse[i].is_input  = false;
1795         output_matrix_reuse[i].eval_mode = eval_mode_j;
1796       }
1797       CeedCallBackend(CeedBasisDestroy(&basis_j));
1798     }
1799     CeedCallBackend(CeedBasisDestroy(&basis_i));
1800   }
1801 
1802   // Initialize constants, and matrices B and G
1803   code << "\n" << tab << "// Input field constants and basis data\n";
1804   for (CeedInt i = 0; i < num_input_fields; i++) {
1805     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
1806                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1807   }
1808   code << "\n" << tab << "// Output field constants and basis data\n";
1809   for (CeedInt i = 0; i < num_output_fields; i++) {
1810     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
1811                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
1812   }
1813 
1814   // Loop over all elements
1815   code << "\n" << tab << "// Element loop\n";
1816   code << tab << "__syncthreads();\n";
1817   code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
1818   tab.push();
1819 
1820   // -- Compute minimum buffer space needed
1821   CeedInt max_rstr_buffer_size = 1;
1822 
1823   for (CeedInt i = 0; i < num_input_fields; i++) {
1824     CeedEvalMode eval_mode;
1825 
1826     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1827     if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
1828       CeedInt             num_comp;
1829       CeedElemRestriction elem_rstr;
1830 
1831       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
1832       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1833       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1834       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1835     }
1836   }
1837   for (CeedInt i = 0; i < num_output_fields; i++) {
1838     CeedEvalMode eval_mode;
1839 
1840     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
1841     if (eval_mode != CEED_EVAL_NONE) {
1842       CeedInt             num_comp;
1843       CeedElemRestriction elem_rstr;
1844 
1845       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
1846       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1847       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1848       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1849     }
1850   }
1851   code << tab << "// Scratch restriction buffer space\n";
1852   code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
1853 
1854   // -- Determine best input field processing order
1855   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
1856 
1857   for (CeedInt i = 0; i < num_input_fields; i++) {
1858     field_rstr_in_buffer[i] = -1;
1859     input_field_order[i]    = -1;
1860   }
1861   {
1862     bool    is_ordered[CEED_FIELD_MAX];
1863     CeedInt curr_index = 0;
1864 
1865     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
1866     for (CeedInt i = 0; i < num_input_fields; i++) {
1867       CeedVector          vec_i;
1868       CeedElemRestriction rstr_i;
1869 
1870       if (is_ordered[i]) continue;
1871       field_rstr_in_buffer[i]       = i;
1872       is_ordered[i]                 = true;
1873       input_field_order[curr_index] = i;
1874       curr_index++;
1875       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
1876       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
1877       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
1878       for (CeedInt j = i + 1; j < num_input_fields; j++) {
1879         CeedVector          vec_j;
1880         CeedElemRestriction rstr_j;
1881 
1882         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
1883         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
1884         if (rstr_i == rstr_j && vec_i == vec_j) {
1885           field_rstr_in_buffer[j]       = i;
1886           is_ordered[j]                 = true;
1887           input_field_order[curr_index] = j;
1888           curr_index++;
1889         }
1890         CeedCallBackend(CeedVectorDestroy(&vec_j));
1891         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
1892       }
1893       CeedCallBackend(CeedVectorDestroy(&vec_i));
1894       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
1895     }
1896   }
1897 
1898   // -- Input restriction and basis
1899   code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
1900   CeedInt active_field_index = -1;
1901 
1902   for (CeedInt i = 0; i < num_input_fields; i++) {
1903     bool          is_active = false;
1904     const char   *field_name;
1905     const CeedInt f = input_field_order[i];
1906 
1907     {
1908       CeedVector vec;
1909 
1910       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
1911       is_active = vec == CEED_VECTOR_ACTIVE;
1912       CeedCallBackend(CeedVectorDestroy(&vec));
1913     }
1914 
1915     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
1916     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
1917 
1918     if (is_active) {
1919       std::string var_suffix = "_in_" + std::to_string(f);
1920 
1921       code << tab << "// Active field - no restriction or basis action here\n";
1922       if (active_field_index == -1) {
1923         active_field_index = f;
1924         code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? "P_1d" + var_suffix : "1")
1925              << "] = {0.0};\n";
1926       } else {
1927         code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_in_" << active_field_index << ";\n";
1928       }
1929     } else {
1930       // ---- Restriction
1931       CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
1932                                                                  max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1933 
1934       // ---- Basis action
1935       CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
1936                                                            is_all_tensor, is_at_points, use_3d_slices));
1937     }
1938   }
1939 
1940   // -- Loop over active field
1941   std::string active_var_suffix = "_in_" + std::to_string(active_field_index);
1942 
1943   code << "\n" << tab << "// Loop over nodes in active field\n";
1944   code << tab << "for (CeedInt n = 0; n < num_comp" << active_var_suffix << "*P_1d" << active_var_suffix
1945        << (max_dim > 1 ? "*P_1d" + active_var_suffix : "") << (max_dim > 2 ? "*P_1d" + active_var_suffix : "") << "; n++) {\n";
1946   tab.push();
1947 
1948   // -- Set current active node and component to 1
1949   code << tab << "// Set current active node and component to 1.0\n";
1950   code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 1.0, r_e"
1951        << active_var_suffix << ");\n\n";
1952 
1953   for (CeedInt i = 0; i < num_input_fields; i++) {
1954     bool          is_active = false;
1955     const char   *field_name;
1956     const CeedInt f = input_field_order[i];
1957 
1958     {
1959       CeedVector vec;
1960 
1961       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
1962       is_active = vec == CEED_VECTOR_ACTIVE;
1963       CeedCallBackend(CeedVectorDestroy(&vec));
1964     }
1965     if (!is_active) continue;
1966 
1967     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
1968     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
1969 
1970     // ---- Basis action
1971     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
1972                                                          is_all_tensor, is_at_points, use_3d_slices));
1973   }
1974 
1975   // -- Q function
1976   CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
1977                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
1978                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices));
1979 
1980   // -- Output basis and restriction
1981   code << "\n" << tab << "// -- Output field basis action and restrictions\n";
1982   for (CeedInt i = 0; i < num_output_fields; i++) {
1983     bool        is_active = false;
1984     const char *field_name;
1985 
1986     {
1987       CeedVector vec;
1988 
1989       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
1990       is_active = vec == CEED_VECTOR_ACTIVE;
1991       CeedCallBackend(CeedVectorDestroy(&vec));
1992     }
1993     if (!is_active) continue;
1994 
1995     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
1996     code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
1997 
1998     // ---- Basis action
1999     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
2000                                                          is_all_tensor, is_at_points, use_3d_slices));
2001 
2002     // ---- Restriction
2003     if (is_full) {
2004       // TODO: UPDATE OUTPUTS FOR FULL ASSEMBLY
2005     } else {
2006       std::string         var_suffix = "_out_" + std::to_string(i);
2007       CeedInt             comp_stride;
2008       CeedSize            l_size;
2009       CeedElemRestriction elem_rstr;
2010 
2011       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
2012       CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
2013       code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
2014       CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
2015       code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
2016       code << tab << "WriteLVecStandard" << max_dim << "d_Single<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
2017            << ">(data, l_size" << var_suffix << ", elem, n, indices.outputs[" << i << "], r_e" << var_suffix << ", values_array);\n";
2018       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
2019     }
2020   }
2021 
2022   // -- Reset current active node and component
2023   code << "\n" << tab << "// Reset current active node and component to 0.0\n";
2024   code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 0.0, r_e"
2025        << active_var_suffix << ");\n";
2026 
2027   // -- End of loop over active field
2028   tab.pop();
2029   code << tab << "}\n";
2030 
2031   // Close loop and function
2032   tab.pop();
2033   code << tab << "}\n";
2034   tab.pop();
2035   code << tab << "}\n";
2036   code << tab << "// -----------------------------------------------------------------------------\n\n";
2037 
2038   CeedInt block_sizes[3] = {0, 0, 0};
2039   CeedInt num_elem;
2040 
2041   // Compile
2042   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
2043   CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes));
2044   block_sizes[2] = 1;
2045   {
2046     bool is_compile_good = false;
2047 
2048     data->thread_1d = block_sizes[0];
2049     CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good,
2050                                        is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 2, "OP_T_1D", block_sizes[0],
2051                                        "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]));
2052     if (is_compile_good) {
2053       *is_good_build = true;
2054       CeedCallBackend(CeedGetKernel_Hip(ceed, is_full ? data->module_assemble_full : data->module_assemble_diagonal, operator_name.c_str(),
2055                                         is_full ? &data->assemble_full : &data->assemble_diagonal));
2056     } else {
2057       *is_good_build              = false;
2058       data->use_assembly_fallback = true;
2059     }
2060   }
2061   CeedCallBackend(CeedDestroy(&ceed));
2062   CeedCallBackend(CeedQFunctionDestroy(&qf));
2063   return CEED_ERROR_SUCCESS;
2064 }
2065 
2066 extern "C" int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) {
2067   return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, false, is_good_build);
2068 }
2069 
2070 //------------------------------------------------------------------------------
2071