xref: /libCEED/rust/libceed-sys/c-src/backends/hip-gen/ceed-hip-gen-operator-build.cpp (revision f1f13db40bc3ea1cde6ca54b4e70359a9e16f116)
1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #define CEED_DEBUG_COLOR 12
9 
10 #include <ceed.h>
11 #include <ceed/backend.h>
12 #include <ceed/gen-tools.h>
13 #include <ceed/jit-tools.h>
14 
15 #include <iostream>
16 #include <sstream>
17 #include <string>
18 
19 #include "../hip-ref/ceed-hip-ref.h"
20 #include "../hip-shared/ceed-hip-shared.h"
21 #include "../hip/ceed-hip-common.h"
22 #include "../hip/ceed-hip-compile.h"
23 #include "ceed-hip-gen.h"
24 
// Describes which previously-emitted field's shared-memory basis matrices
// (s_B_*/s_G_*) the current field may alias instead of loading its own copy.
struct FieldReuse_Hip {
  CeedInt      index;      // index of the field to reuse; -1 means no reuse
  bool         is_input;   // true if the reused field is an input, false if an output
  CeedEvalMode eval_mode;  // eval mode of the reused field (G matrices only reusable for CEED_EVAL_GRAD)
};
30 
31 //------------------------------------------------------------------------------
32 // Calculate the block size used for launching the operator kernel
33 //------------------------------------------------------------------------------
34 extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt num_elem, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes) {
35   const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
36   if (dim == 1) {
37     CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
38 
39     elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
40     block_sizes[0]  = thread_1d;
41     block_sizes[1]  = 1;
42     block_sizes[2]  = elems_per_block;
43   } else if (dim == 2) {
44     const CeedInt elems_per_block = thread_1d < 4 ? 16 : 2;
45 
46     block_sizes[0] = thread_1d;
47     block_sizes[1] = thread_1d;
48     block_sizes[2] = elems_per_block;
49   } else if (dim == 3) {
50     const CeedInt elems_per_block = thread_1d < 6 ? 4 : (thread_1d < 8 ? 2 : 1);
51 
52     block_sizes[0] = thread_1d;
53     block_sizes[1] = thread_1d;
54     block_sizes[2] = elems_per_block;
55   }
56   return CEED_ERROR_SUCCESS;
57 }
58 
59 //------------------------------------------------------------------------------
60 // Determine type of operator
61 //------------------------------------------------------------------------------
// Collect aggregate basis data across all operator fields: whether every basis is
// tensor-product, the maximum P/P_1d and dimension, the (shared) Q/Q_1d, and whether
// the 3D collocated-gradient slicing strategy can be used.
// NOTE(review): *max_dim is only maximized here (never reset to 0) — presumably the
// caller initializes it; confirm at call sites.
static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
                                               CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields,
                                               CeedQFunctionField *qf_output_fields, CeedInt *max_P, CeedInt *max_P_1d, CeedInt *Q, CeedInt *Q_1d,
                                               CeedInt *max_dim, bool *is_all_tensor, bool *use_3d_slices) {
  // Check if all are tensor
  *is_all_tensor = true;
  for (CeedInt i = 0; i < num_input_fields; i++) {
    CeedBasis basis;

    CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
    if (basis != CEED_BASIS_NONE) {
      bool is_field_tensor;

      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
      // A single non-tensor basis makes the whole operator non-tensor
      *is_all_tensor = *is_all_tensor && is_field_tensor;
    }
    CeedCallBackend(CeedBasisDestroy(&basis));
  }
  for (CeedInt i = 0; i < num_output_fields; i++) {
    CeedBasis basis;

    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
    if (basis != CEED_BASIS_NONE) {
      bool is_field_tensor;

      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
      *is_all_tensor = *is_all_tensor && is_field_tensor;
    }
    CeedCallBackend(CeedBasisDestroy(&basis));
  }

  // Find max_P, max_P_1d, Q, and Q_1d
  bool is_all_3d = true;

  *max_P    = 0;
  *max_P_1d = 0;
  *Q        = 0;
  *Q_1d     = 0;
  for (CeedInt i = 0; i < num_input_fields; i++) {
    CeedBasis basis;

    CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
    if (basis != CEED_BASIS_NONE) {
      bool    is_field_tensor;
      CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;

      // Check if 3D
      CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
      is_all_3d = is_all_3d && (field_dim == 3);
      *max_dim  = CeedIntMax(*max_dim, field_dim);

      // Collect P, P_1d, Q, and Q_1d
      CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
      *max_P = CeedIntMax(*max_P, field_P);
      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
      if (is_field_tensor) {
        CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
        *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
      }
      // All fields must share a single quadrature space (same Q)
      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
      CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
      *Q = field_Q;
      if (is_field_tensor) {
        CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
        CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
        *Q_1d = field_Q_1d;
      }
    }
    CeedCallBackend(CeedBasisDestroy(&basis));
  }
  for (CeedInt i = 0; i < num_output_fields; i++) {
    CeedBasis basis;

    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
    if (basis != CEED_BASIS_NONE) {
      bool    is_field_tensor;
      CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;

      // Check if 3D
      CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
      is_all_3d = is_all_3d && (field_dim == 3);
      *max_dim  = CeedIntMax(*max_dim, field_dim);

      // Collect P, P_1d, Q, and Q_1d
      CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
      *max_P = CeedIntMax(*max_P, field_P);
      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
      if (is_field_tensor) {
        CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
        *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
      }
      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
      CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
      *Q = field_Q;
      if (is_field_tensor) {
        CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
        CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
        *Q_1d = field_Q_1d;
      }
    }
    CeedCallBackend(CeedBasisDestroy(&basis));
  }

  // Only use 3D collocated gradient parallelization strategy when gradient is computed
  // use_3d_slices ends up true only if ALL gradient bases have a collocated 1D gradient
  *use_3d_slices = false;
  if (is_all_3d && *is_all_tensor) {
    bool was_grad_found = false;

    for (CeedInt i = 0; i < num_input_fields; i++) {
      CeedEvalMode eval_mode;

      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
      if (eval_mode == CEED_EVAL_GRAD) {
        CeedBasis_Hip_shared *basis_data;
        CeedBasis             basis;

        CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
        CeedCallBackend(CeedBasisGetData(basis, &basis_data));
        // First GRAD field sets the flag; later ones AND into it
        *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
        was_grad_found = true;
        CeedCallBackend(CeedBasisDestroy(&basis));
      }
    }
    for (CeedInt i = 0; i < num_output_fields; i++) {
      CeedEvalMode eval_mode;

      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      if (eval_mode == CEED_EVAL_GRAD) {
        CeedBasis_Hip_shared *basis_data;
        CeedBasis             basis;

        CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
        CeedCallBackend(CeedBasisGetData(basis, &basis_data));
        *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
        was_grad_found = true;
        CeedCallBackend(CeedBasisDestroy(&basis));
      }
    }
  }
  return CEED_ERROR_SUCCESS;
}
203 
204 //------------------------------------------------------------------------------
205 // Setup fields
206 //------------------------------------------------------------------------------
// Emit per-field constants (dim, P, Q, num_comp) and shared-memory basis-matrix
// declarations/loads (s_B_*, s_G_*) into the generated kernel source, and record
// the corresponding device matrix pointers in data->B / data->G for kernel launch.
// For AtPoints operators the Chebyshev interpolation matrix is lazily uploaded to
// the device and cached on the basis data on first use.
static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i,
                                                    CeedOperatorField op_field, CeedQFunctionField qf_field, FieldReuse_Hip field_reuse,
                                                    CeedInt max_dim, CeedInt Q, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
                                                    bool use_3d_slices) {
  bool      is_tensor = true;
  CeedBasis basis;
  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
  if (basis != CEED_BASIS_NONE) CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));

  const char           *field_name;
  std::string           var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
  std::string           P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
  std::string           option_name = (is_input ? "inputs" : "outputs");
  CeedEvalMode          eval_mode   = CEED_EVAL_NONE;
  CeedInt               elem_size = 0, num_comp = 0, dim = max_dim, P_1d = 0;
  CeedElemRestriction   elem_rstr;
  // NOTE(review): basis_data is only set below when basis != CEED_BASIS_NONE;
  // presumably eval modes that read it (INTERP/GRAD) never occur with a NONE basis — confirm
  CeedBasis_Hip_shared *basis_data;

  // Field reuse info
  bool use_previous_field = field_reuse.index != -1;

  CeedCallBackend(CeedOperatorFieldGetName(op_field, &field_name));
  code << tab << "// -- " << (is_input ? "Input" : "Output") << " field " << i << ": " << field_name << "\n";

  // Get field data
  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
  }
  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
  if (basis != CEED_BASIS_NONE) {
    CeedCallBackend(CeedBasisGetData(basis, &basis_data));
    CeedCallBackend(CeedBasisGetDimension(basis, &dim));
    // P_1d holds 1D node count for tensor bases, total node count otherwise
    if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
    else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
  }
  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));

  // Set field constants
  code << tab << "const CeedInt dim" << var_suffix << " = " << dim << ";\n";
  if (is_tensor && !is_all_tensor) {
    // Mixed tensor/non-tensor operator: also emit the total node count P
    CeedInt P = 0;

    CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
    code << tab << "const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q : P) << ";\n";
  }
  // With no basis the field lives directly at quadrature points, so P == Q
  code << tab << "const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
  if (eval_mode != CEED_EVAL_WEIGHT) {
    code << tab << "const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
  }

  // Load basis data
  code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
  switch (eval_mode) {
    case CEED_EVAL_NONE:
      // No basis matrices needed
      break;
    case CEED_EVAL_INTERP:
      if (is_at_points) {
        // AtPoints
        // Lazily build and upload the Chebyshev interpolation matrix; cached on basis_data
        if (!basis_data->d_chebyshev_interp_1d) {
          CeedSize    interp_bytes;
          CeedScalar *chebyshev_interp_1d;

          interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
          CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
          CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
          CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
          CeedCallHip(CeedBasisReturnCeed(basis),
                      hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
          CeedCallBackend(CeedFree(&chebyshev_interp_1d));
        }
        if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
        else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
      } else {
        // Standard quadrature
        if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
        else data->B.outputs[i] = basis_data->d_interp_1d;
      }
      if (use_previous_field) {
        // Alias the shared-memory B matrix already loaded for the reused field
        std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

        code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
      } else {
        code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
        code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
      }
      break;
    case CEED_EVAL_GRAD:
      if (is_at_points) {
        // AtPoints
        // Same lazy Chebyshev upload as for INTERP above
        if (!basis_data->d_chebyshev_interp_1d) {
          CeedSize    interp_bytes;
          CeedScalar *chebyshev_interp_1d;

          interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
          CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
          CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
          CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
          CeedCallHip(CeedBasisReturnCeed(basis),
                      hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
          CeedCallBackend(CeedFree(&chebyshev_interp_1d));
        }
        if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
        else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
      } else {
        // Standard quadrature
        if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
        else data->B.outputs[i] = basis_data->d_interp_1d;
      }
      if (is_tensor) {
        // Tensor gradient also needs the interpolation matrix B
        if (use_previous_field) {
          std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

          code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
        } else {
          code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
          code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
        }
      }
      if (is_at_points) break;  // No G mat for AtPoints
      if (use_3d_slices) {
        // Collocated gradient: G is a Q x Q matrix applied at quadrature points
        if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
        else data->G.outputs[i] = basis_data->d_collo_grad_1d;
        // G matrices are only interchangeable between GRAD fields
        if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
          std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

          code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
        } else {
          code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
          code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
        }
      } else {
        // Prefer the collocated gradient matrix when the basis provides one
        bool has_collo_grad = basis_data->d_collo_grad_1d;

        if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
        else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
        if (has_collo_grad) {
          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
            std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

            code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
          } else {
            code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
            code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
          }
        } else {
          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
            std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

            code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
          } else {
            // Non-tensor G includes an extra dim factor in its size and template args
            code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim")
                 << (is_tensor ? "" : var_suffix) << "];\n";
            code << tab << "LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << (is_tensor ? "" : var_suffix) << ">(data, G."
                 << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
          }
        }
      }
      break;
    case CEED_EVAL_WEIGHT:
      break;  // No action
      // LCOV_EXCL_START
    case CEED_EVAL_DIV:
    case CEED_EVAL_CURL:
      break;  // TODO: Not implemented
              // LCOV_EXCL_STOP
  }
  CeedCallBackend(CeedBasisDestroy(&basis));
  return CEED_ERROR_SUCCESS;
}
378 
379 //------------------------------------------------------------------------------
380 // Restriction
381 //------------------------------------------------------------------------------
// Emit the element-restriction read (inputs) or write (outputs) for one field into
// the generated kernel source, and record device offset arrays in data->indices.
// Inputs may share an already-restricted buffer (field_input_buffer) instead of
// re-reading from the L-vector.
static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i,
                                                      CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
                                                      CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
                                                      bool use_3d_slices) {
  std::string              var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
  std::string              P_name     = (is_all_tensor ? "P_1d" : "P") + var_suffix;
  CeedEvalMode             eval_mode  = CEED_EVAL_NONE;
  CeedInt                  elem_size = 0, num_comp = 0;
  CeedSize                 l_size;
  CeedRestrictionType      rstr_type = CEED_RESTRICTION_STANDARD;
  // NOTE(review): rstr_data is only set when elem_rstr != CEED_ELEMRESTRICTION_NONE;
  // presumably the branches that read it never run with a NONE restriction — confirm
  CeedElemRestriction_Hip *rstr_data;
  CeedElemRestriction      elem_rstr;

  // Get field data
  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
    CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
    CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
  }
  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));

  // Restriction
  if (is_input) {
    // Input
    if (field_input_buffer[i] != i) {
      // This input shares data with an earlier input; alias its E-vector buffer
      std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);

      // Restriction was already done for previous input
      code << tab << "CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
    } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) {
      if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) {
        // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated
        code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
      } else if (rstr_type != CEED_RESTRICTION_POINTS) {
        // Otherwise we're using the scratch space
        code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
      }
      switch (rstr_type) {
        case CEED_RESTRICTION_STANDARD: {
          CeedInt comp_stride;

          CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
          code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
          // Device offsets are passed to the kernel at launch via data->indices
          data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
          code << tab << "ReadLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
               << P_name << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix
               << ");\n";
          break;
        }
        case CEED_RESTRICTION_STRIDED: {
          bool    has_backend_strides;
          CeedInt num_elem;

          CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
          CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
          // Default backend strides; overwritten below when user strides are set
          CeedInt strides[3] = {1, elem_size * num_elem, elem_size};

          if (!has_backend_strides) {
            CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
          }
          code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
               << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
          code << tab << "ReadLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
               << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, d" << var_suffix << ", r_e"
               << var_suffix << ");\n";
          break;
        }
        case CEED_RESTRICTION_POINTS: {
          // Points restriction: only constants and offsets are needed here
          CeedInt comp_stride;

          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
          data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
          break;
        }
        // LCOV_EXCL_START
        case CEED_RESTRICTION_ORIENTED:
        case CEED_RESTRICTION_CURL_ORIENTED:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
  } else {
    // Output
    switch (rstr_type) {
      case CEED_RESTRICTION_STANDARD: {
        CeedInt comp_stride;

        CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
        code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
        CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
        code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
        data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
        code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
             << P_name << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix
             << ");\n";
        break;
      }
      case CEED_RESTRICTION_STRIDED: {
        bool    has_backend_strides;
        CeedInt num_elem;

        CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
        CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
        CeedInt strides[3] = {1, elem_size * num_elem, elem_size};

        if (!has_backend_strides) {
          CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
        }
        code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
             << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
        code << tab << "WriteLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
             << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, r_e" << var_suffix << ", d" << var_suffix
             << ");\n";
        break;
      }
      case CEED_RESTRICTION_POINTS:
        data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
        break;
      // LCOV_EXCL_START
      case CEED_RESTRICTION_ORIENTED:
      case CEED_RESTRICTION_CURL_ORIENTED:
        break;  // TODO: Not implemented
                // LCOV_EXCL_STOP
    }
  }
  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
  return CEED_ERROR_SUCCESS;
}
515 
516 //------------------------------------------------------------------------------
517 // Basis
518 //------------------------------------------------------------------------------
519 static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i, CeedOperatorField op_field,
520                                                 CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor,
521                                                 bool is_at_points, bool use_3d_slices) {
522   bool      is_tensor = true;
523   CeedBasis basis;
524   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
525   CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
526 
527   std::string         var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
528   std::string         P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
529   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
530   CeedInt             dim = max_dim, elem_size = 0, num_comp = 0, P_1d = 0;
531   CeedElemRestriction elem_rstr;
532 
533   // Get field data
534   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
535   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
536     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
537     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
538   }
539   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
540   if (basis != CEED_BASIS_NONE) {
541     CeedCallBackend(CeedBasisGetDimension(basis, &dim));
542     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
543     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
544   }
545   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
546 
547   // Basis
548   code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
549   if (is_input) {
550     switch (eval_mode) {
551       case CEED_EVAL_NONE:
552         if (!use_3d_slices && !is_at_points) {
553           code << tab << "CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n";
554         }
555         break;
556       case CEED_EVAL_INTERP:
557         if (is_at_points) {
558           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
559 
560           code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
561           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
562                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
563         } else {
564           std::string function_name = is_tensor
565                                           ? ((dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
566                                           : "InterpNonTensor";
567           std::string op_t_1d_name  = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
568 
569           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
570           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
571                << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
572         }
573         break;
574       case CEED_EVAL_GRAD:
575         if (is_at_points) {
576           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
577 
578           code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
579           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
580                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
581         } else if (use_3d_slices) {
582           std::string function_name = (dim > 1 ? "InterpTensor" : "Interp") + std::to_string(dim) + "d";
583 
584           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
585           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
586                << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
587         } else if (is_tensor) {
588           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
589           std::string function_name = (dim == 1 ? "Grad" : (is_collocated ? "GradTensorCollocated" : "GradTensor")) + std::to_string(dim) + "d" +
590                                       (is_all_tensor ? "" : "Flattened");
591           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
592 
593           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
594                << (is_all_tensor && dim >= 3 ? Q_name : "1") << "];\n";
595           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
596                << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
597         } else {
598           std::string function_name = "GradNonTensor";
599 
600           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
601           code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
602                << ", OP_T_1D>(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
603         }
604         break;
605       case CEED_EVAL_WEIGHT: {
606         if (is_at_points) {
607           code << tab << "// Nothing to do AtPoints\n";
608         } else {
609           CeedBasis_Hip_shared *basis_data;
610           std::string           function_name = is_tensor
611                                                     ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
612                                                     : "WeightNonTensor";
613 
614           code << tab << "CeedScalar r_q" << var_suffix << "[" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
615           CeedCallBackend(CeedBasisGetData(basis, &basis_data));
616           data->W = basis_data->d_q_weight_1d;
617           code << tab << function_name << "<" << P_name << ", " << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
618         }
619         break;
620       }
621       // LCOV_EXCL_START
622       case CEED_EVAL_DIV:
623       case CEED_EVAL_CURL:
624         break;  // TODO: Not implemented
625                 // LCOV_EXCL_STOP
626     }
627   } else {
628     switch (eval_mode) {
629       case CEED_EVAL_NONE:
630         code << tab << "CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
631         break;  // No action
632       case CEED_EVAL_INTERP:
633         code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
634         if (is_at_points) {
635           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
636 
637           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
638                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
639         } else {
640           std::string function_name =
641               is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
642                         : "InterpTransposeNonTensor";
643           std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
644 
645           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
646                << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
647         }
648         break;
649       case CEED_EVAL_GRAD:
650         code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
651         if (is_at_points) {
652           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
653 
654           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
655                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
656         } else if (use_3d_slices) {
657           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
658 
659           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
660                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
661         } else if (is_tensor) {
662           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
663           std::string function_name = (dim == 1 ? "GradTranspose" : (is_collocated ? "GradTransposeTensorCollocated" : "GradTransposeTensor")) +
664                                       std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
665           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
666 
667           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
668                << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
669         } else {
670           std::string function_name = "GradTransposeNonTensor";
671 
672           code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
673                << ", OP_T_1D>(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
674         }
675         break;
676       // LCOV_EXCL_START
677       case CEED_EVAL_WEIGHT:
678         break;  // Should not occur
679       case CEED_EVAL_DIV:
680       case CEED_EVAL_CURL:
681         break;  // TODO: Not implemented
682                 // LCOV_EXCL_STOP
683     }
684   }
685   CeedCallBackend(CeedBasisDestroy(&basis));
686   return CEED_ERROR_SUCCESS;
687 }
688 
689 //------------------------------------------------------------------------------
690 // QFunction
691 //------------------------------------------------------------------------------
// Emit the QFunction-application section of the generated HIP operator kernel.
//
// Appends HIP C++ source text to `code` that (1) declares the per-thread output
// registers, (2) gathers per-quadrature-point input values into `r_s_*` registers,
// (3) calls the user QFunction, and (4) scatters its outputs back. Three layouts
// are generated, chosen by the flags:
//   - is_at_points:   loop over batches of arbitrary points per element;
//   - use_3d_slices:  loop over 1D planes (q-slices) of 3D tensor elements;
//   - otherwise:      whole elements, with registers aliased directly.
//
// Naming convention in the emitted code: r_q_* holds quadrature-space data,
// r_c_* holds coefficient accumulators (AtPoints only), r_s_* holds the
// per-point slice handed to the QFunction; *_in_<i>/*_out_<i> suffixes tie
// each register to its operator field.
//
// Parameters mirror the sibling build helpers: `data` receives device pointers
// (e.g. restriction offsets) needed as kernel arguments; `tab` tracks the
// indentation of the emitted source; the field arrays/counts describe the
// operator's inputs and outputs; `qfunction_name` is the symbol to call.
// Returns CEED_ERROR_SUCCESS or propagates backend errors via CeedCallBackend.
static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt max_dim,
                                                    CeedInt max_num_points, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
                                                    CeedQFunctionField *qf_input_fields, CeedInt num_output_fields,
                                                    CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields,
                                                    std::string qfunction_name, CeedInt Q_1d, bool is_all_tensor, bool is_at_points,
                                                    bool use_3d_slices) {
  // Tensor-product bases index quadrature points by 1D count; non-tensor by total count
  std::string         Q_name    = is_all_tensor ? "Q_1d" : "Q";
  CeedEvalMode        eval_mode = CEED_EVAL_NONE;
  CeedElemRestriction elem_rstr;

  // Setup output arrays
  code << "\n";
  code << tab << "// -- Output field setup\n";
  for (CeedInt i = 0; i < num_output_fields; i++) {
    const char *field_name;
    std::string var_suffix = "_out_" + std::to_string(i);

    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
    code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
    switch (eval_mode) {
      case CEED_EVAL_NONE:
        if (is_at_points) {
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n";
        } else {
          // One register per component, times Q_1d only when each thread owns a full 3D column
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
               << "];\n";
        }
        break;
      case CEED_EVAL_INTERP:
        if (is_at_points) {
          // Accumulator for point data
          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
          code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
               << "[i] = 0.0;\n";
        } else {
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
               << "];\n";
        }
        break;
      case CEED_EVAL_GRAD:
        if (is_at_points) {
          // Accumulator for point data
          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
          code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
               << "[i] = 0.0;\n";
        } else if (use_3d_slices) {
          // Accumulator for gradient slices
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
          code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) r_q" << var_suffix << "[i] = 0.0;\n";
        } else {
          // Gradient carries an extra dim factor per component
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
               << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") << "];\n";
        }
        break;
      case CEED_EVAL_WEIGHT:
        break;
        // LCOV_EXCL_START
      case CEED_EVAL_DIV:
      case CEED_EVAL_CURL:
        break;  // TODO: Not implemented
                // LCOV_EXCL_STOP
    }
  }

  if (is_at_points) {
    // We need to handle batches of points
    code << "\n";
    code << tab << "// Note: Using batches of points\n";
    // Loop bound is rounded up to a multiple of the thread-block size so every
    // thread executes the same number of iterations; p wraps out-of-range
    // indices back into [0, max_num_points). NOTE(review): presumably needed so
    // the collective *AtPoints basis calls below see all threads — confirm.
    code << tab << "const CeedInt point_loop_bound = (blockDim.x*blockDim.y) * ceil((1.0*max_num_points) / (blockDim.x*blockDim.y));\n\n";
    code << tab << "#pragma unroll\n";
    code << tab << "for (CeedInt i = threadIdx.x + threadIdx.y*blockDim.x; i < point_loop_bound; i += blockDim.x*blockDim.y) {\n";
    tab.push();
    code << tab << "const CeedInt p = i % max_num_points;\n\n";

    code << tab << "// -- Coordinates\n";
    code << tab << "CeedScalar r_x[max_dim];\n";
    code << tab << "ReadPoint<max_dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n";

    code << tab << "// -- Input fields\n";
    for (CeedInt i = 0; i < num_input_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_in_" + std::to_string(i);
      std::string P_name     = "P_1d" + var_suffix;

      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
      code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
      // Basis action
      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          // Direct gather from the L-vector at this point; no basis applied
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          code << tab << "ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
               << ", max_num_points>(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
          break;
        case CEED_EVAL_INTERP:
          // Evaluate interpolated value from coefficients r_c at point coordinates r_x
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          code << tab << "InterpAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
               << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
          break;
        case CEED_EVAL_GRAD:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          code << tab << "GradAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
               << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
          break;
        case CEED_EVAL_WEIGHT:
          // AtPoints quadrature weight is identically 1.0
          code << tab << "CeedScalar r_s" << var_suffix << "[1];\n";
          code << tab << "r_s" << var_suffix << "[0] = 1.0;\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
    code << "\n";
    code << tab << "// -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_out_" + std::to_string(i);

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      // Basis action
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          break;
        case CEED_EVAL_INTERP:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          break;
        case CEED_EVAL_GRAD:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_WEIGHT:
          break;  // Should not occur
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }

  } else if (use_3d_slices) {
    // We treat quadrature points per slice in 3d to save registers
    code << "\n";
    code << tab << "// Note: Using planes of 3D elements\n";
    code << tab << "#pragma unroll\n";
    code << tab << "for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
    tab.push();
    code << tab << "// -- Input fields\n";
    for (CeedInt i = 0; i < num_input_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_in_" + std::to_string(i);

      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
      code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
      // Basis action
      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          // No basis: gather one q-slice straight from the E-vector/L-vector.
          // Restriction strides/offsets are resolved at build time and baked
          // into the generated source as compile-time constants.
          bool is_strided;

          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";

          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
          CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
          if (is_strided) {
            bool    has_backend_strides;
            CeedInt num_elem, elem_size;

            CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
            CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
            CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
            // Default backend layout; overwritten below if user strides are set
            CeedInt strides[3] = {1, elem_size * num_elem, elem_size};

            if (!has_backend_strides) {
              CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
            }
            code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
                 << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
            code << tab << "ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", strides" << var_suffix << "_0, strides"
                 << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n";
          } else {
            CeedSize                 l_size = 0;
            CeedInt                  comp_stride;
            CeedElemRestriction_Hip *rstr_data;

            CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
            code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
            CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
            code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
            CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
            // Side effect: record the device offsets pointer so the kernel launch
            // can pass it as indices.inputs[i]
            data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
            code << tab << "ReadEVecSliceStandard3d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", " << Q_name << ">(data, l_size"
                 << var_suffix << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
          }
          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
          break;
        case CEED_EVAL_INTERP:
          // Copy slice q of each component out of the full quadrature register r_q
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n";
          tab.push();
          code << tab << "r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n";
          tab.pop();
          code << tab << "}\n";
          break;
        case CEED_EVAL_GRAD:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          code << tab << "GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_q" << var_suffix << ", s_G"
               << var_suffix << ", r_s" << var_suffix << ");\n";
          break;
        case CEED_EVAL_WEIGHT:
          code << tab << "CeedScalar r_s" << var_suffix << "[1];\n";
          code << tab << "r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
    code << "\n";
    code << tab << "// -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_out_" + std::to_string(i);

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      // Basis action
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          break;
        case CEED_EVAL_INTERP:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          break;
        case CEED_EVAL_GRAD:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_WEIGHT:
          break;  // Should not occur
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
  } else {
    // Full-element path: no extra loop, r_s_* simply aliases r_q_* (a scoped
    // block is emitted so the generated braces still balance with tab.pop below)
    code << "\n";
    code << tab << "// Note: Using full elements\n";
    code << tab << "{\n";
    tab.push();
    code << tab << "// -- Input fields\n";
    for (CeedInt i = 0; i < num_input_fields; i++) {
      const char *field_name;

      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
      code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
      code << tab << "CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n";
    }
    code << tab << "// -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      code << tab << "CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n";
    }
  }

  // Input and output buffers
  // Arrays are sized at least 1 so zero-field operators still compile
  code << "\n";
  code << tab << "// -- QFunction inputs and outputs\n";
  code << tab << "// ---- Inputs\n";
  code << tab << "CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
  for (CeedInt i = 0; i < num_input_fields; i++) {
    const char *field_name;

    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
    code << tab << "// ------ Input field " << i << ": " << field_name << "\n";
    code << tab << "inputs[" << i << "] = r_s_in_" << i << ";\n";
  }
  code << tab << "// ---- Outputs\n";
  code << tab << "CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
  for (CeedInt i = 0; i < num_output_fields; i++) {
    const char *field_name;

    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
    code << tab << "// ------ Output field " << i << ": " << field_name << "\n";
    code << tab << "outputs[" << i << "] = r_s_out_" << i << ";\n";
  }

  // Apply QFunction
  code << "\n";
  code << tab << "// -- Apply QFunction\n";
  code << tab << "" << qfunction_name << "(ctx, ";
  // Q argument: each thread processes one point, except the full-element 3D
  // all-tensor path where each thread holds a Q_1d column of points
  if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
    code << "1";
  } else {
    code << Q_name;
  }
  code << ", inputs, outputs);\n";

  if (is_at_points) {
    // Map back to coefficients
    code << "\n";
    code << tab << "// -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_out_" + std::to_string(i);
      std::string P_name     = "P_1d" + var_suffix;

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      // Basis action
      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
      switch (eval_mode) {
        case CEED_EVAL_NONE: {
          CeedInt             comp_stride;
          CeedElemRestriction elem_rstr;

          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
          code << tab << "WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
               << ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]"
               << ", r_s" << var_suffix << ", d" << var_suffix << ");\n";
          break;
        }
        case CEED_EVAL_INTERP:
          // Zero padded (out-of-range) threads so they contribute nothing to the
          // transpose accumulation into r_c
          code << tab << "if (i >= points.num_per_elem[elem]) {\n";
          tab.push();
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
          tab.pop();
          code << tab << "}\n";
          code << tab << "InterpTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
               << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
          break;
        case CEED_EVAL_GRAD:
          code << tab << "if (i >= points.num_per_elem[elem]) {\n";
          tab.push();
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
          tab.pop();
          code << tab << "}\n";
          code << tab << "GradTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
               << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_WEIGHT:
          break;  // Should not occur
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
  } else if (use_3d_slices) {
    // Copy or apply transpose grad, if needed
    code << "\n";
    code << tab << "// -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_out_" + std::to_string(i);
      std::string P_name     = "P_1d" + var_suffix;

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      // Basis action
      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          // Store this slice back into its place in the full r_q register
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
          tab.push();
          code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
          tab.pop();
          code << tab << "}\n";
          break;
        case CEED_EVAL_INTERP:
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
          tab.push();
          code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
          tab.pop();
          code << tab << "}\n";
          break;
        case CEED_EVAL_GRAD:
          // Accumulate transpose of the collocated slice gradient into r_q
          code << tab << "GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_s" << var_suffix << ", s_G"
               << var_suffix << ", r_q" << var_suffix << ");\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_WEIGHT:
          break;  // Should not occur
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
  }
  // Close the point/slice loop (or the plain scoped block on the full-element path)
  tab.pop();
  code << tab << "}\n";
  return CEED_ERROR_SUCCESS;
}
1107 
1108 //------------------------------------------------------------------------------
1109 // Build single operator kernel
1110 //------------------------------------------------------------------------------
1111 extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_build) {
1112   bool                   is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
1113   Ceed                   ceed;
1114   CeedInt                Q = 0, Q_1d = 0, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
1115   CeedQFunctionField    *qf_input_fields, *qf_output_fields;
1116   CeedQFunction_Hip_gen *qf_data;
1117   CeedQFunction          qf;
1118   CeedOperatorField     *op_input_fields, *op_output_fields;
1119   CeedOperator_Hip_gen  *data;
1120   std::ostringstream     code;
1121   Tab                    tab;
1122 
1123   CeedCallBackend(CeedOperatorGetData(op, &data));
1124   {
1125     bool is_setup_done;
1126 
1127     CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
1128     if (is_setup_done) {
1129       *is_good_build = !data->use_fallback;
1130       return CEED_ERROR_SUCCESS;
1131     }
1132   }
1133 
1134   // Check field compatibility
1135   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
1136   {
1137     bool has_shared_bases = true;
1138 
1139     for (CeedInt i = 0; i < num_input_fields; i++) {
1140       CeedBasis basis;
1141 
1142       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
1143       if (basis != CEED_BASIS_NONE) {
1144         bool        is_tensor = true;
1145         const char *resource;
1146         char       *resource_root;
1147         Ceed        basis_ceed;
1148 
1149         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
1150         is_all_tensor    = is_all_tensor && is_tensor;
1151         is_all_nontensor = is_all_nontensor && !is_tensor;
1152         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
1153         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
1154         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
1155         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
1156         CeedCallBackend(CeedFree(&resource_root));
1157         CeedCallBackend(CeedDestroy(&basis_ceed));
1158       }
1159       CeedCallBackend(CeedBasisDestroy(&basis));
1160     }
1161 
1162     for (CeedInt i = 0; i < num_output_fields; i++) {
1163       CeedBasis basis;
1164 
1165       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
1166       if (basis != CEED_BASIS_NONE) {
1167         bool        is_tensor = true;
1168         const char *resource;
1169         char       *resource_root;
1170         Ceed        basis_ceed;
1171 
1172         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
1173         is_all_tensor    = is_all_tensor && is_tensor;
1174         is_all_nontensor = is_all_nontensor && !is_tensor;
1175 
1176         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
1177         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
1178         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
1179         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
1180         CeedCallBackend(CeedFree(&resource_root));
1181         CeedCallBackend(CeedDestroy(&basis_ceed));
1182       }
1183       CeedCallBackend(CeedBasisDestroy(&basis));
1184     }
1185     // -- Fallback to ref if not all bases are shared
1186     if (!has_shared_bases) {
1187       *is_good_build = false;
1188       return CEED_ERROR_SUCCESS;
1189     }
1190   }
1191   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1192   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
1193   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
1194   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
1195 
1196   // Get operator data
1197   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
1198   {
1199     CeedInt max_P = 0, max_P_1d = 0;
1200 
1201     CeedCallBackend(CeedOperatorBuildKernelData_Hip_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields,
1202                                                         qf_output_fields, &max_P, &max_P_1d, &Q, &Q_1d, &max_dim, &is_all_tensor, &use_3d_slices));
1203     data->max_P_1d = is_all_tensor ? max_P_1d : max_P;
1204   }
1205   if (max_dim == 0) max_dim = 1;
1206   data->dim = max_dim;
1207   if (is_at_points) {
1208     CeedElemRestriction_Hip *rstr_data;
1209     CeedElemRestriction      rstr_points = NULL;
1210 
1211     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
1212     CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
1213     CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
1214     CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data));
1215     data->points.indices = (CeedInt *)rstr_data->d_offsets;
1216     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
1217   }
1218   if (is_at_points) use_3d_slices = false;
1219   if (Q_1d == 0) {
1220     if (is_at_points) Q_1d = max_num_points;
1221     else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d));
1222   }
1223   if (Q == 0) Q = Q_1d;
1224   data->Q    = Q;
1225   data->Q_1d = Q_1d;
1226 
1227   // Check for restriction only identity operator
1228   {
1229     bool is_identity_qf;
1230 
1231     CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf));
1232     if (is_identity_qf) {
1233       CeedEvalMode eval_mode_in, eval_mode_out;
1234 
1235       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in));
1236       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out));
1237       CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND,
1238                 "Backend does not implement restriction only identity operators");
1239     }
1240   }
1241 
1242   // Load basis source files
1243   if (!is_all_nontensor) {
1244     code << tab << "// Tensor basis source\n";
1245     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
1246   }
1247   if (!is_all_tensor) {
1248     code << tab << "// Non-tensor basis source\n";
1249     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n";
1250   }
1251   if (is_at_points) {
1252     code << tab << "// AtPoints basis source\n";
1253     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n";
1254   }
1255   if (!is_all_tensor && !is_all_nontensor) {
1256     code << tab << "// Tensor basis source\n";
1257     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h>\n\n";
1258   }
1259   code << tab << "// CodeGen operator source\n";
1260   code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
1261 
1262   // Get QFunction name
1263   std::string qfunction_name(qf_data->qfunction_name);
1264   std::string operator_name;
1265 
1266   operator_name = "CeedKernelHipGenOperator_" + qfunction_name;
1267 
1268   // Define CEED_Q_VLA
1269   code << "\n" << tab << "#undef CEED_Q_VLA\n";
1270   if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
1271     code << tab << "#define CEED_Q_VLA 1\n\n";
1272   } else {
1273     code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n";
1274   }
1275 
1276   // Add user QFunction source
1277   {
1278     const char *source_path;
1279 
1280     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
1281     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
1282 
1283     code << tab << "// User QFunction source\n";
1284     code << tab << "#include \"" << source_path << "\"\n\n";
1285   }
1286 
1287   // Setup
1288   code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
1289   code << tab << "// Operator Kernel\n";
1290   code << tab << "// \n";
1291   code << tab << "// d_[in,out]_i:   CeedVector device array\n";
1292   code << tab << "// r_[in,out]_e_i: Element vector register\n";
1293   code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
1294   code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
1295   code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
1296   code << tab << "// \n";
1297   code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
1298   code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
1299   code << tab << "// -----------------------------------------------------------------------------\n";
1300   code << tab << "extern \"C\" __launch_bounds__(BLOCK_SIZE)\n";
1301   code << "__global__ void " << operator_name
1302        << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar* W, Points_Hip points) {\n";
1303   tab.push();
1304 
1305   // Scratch buffers
1306   for (CeedInt i = 0; i < num_input_fields; i++) {
1307     CeedEvalMode eval_mode;
1308 
1309     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1310     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
1311       code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
1312     }
1313   }
1314   for (CeedInt i = 0; i < num_output_fields; i++) {
1315     code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
1316   }
1317 
1318   code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
1319   if (!is_all_tensor) {
1320     code << tab << "const CeedInt Q = " << Q << ";\n";
1321   }
1322   if (!is_all_nontensor) {
1323     code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
1324   }
1325   if (is_at_points) {
1326     code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n";
1327     code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
1328   }
1329 
1330   // Shared data
1331   code << tab << "extern __shared__ CeedScalar slice[];\n";
1332   code << tab << "SharedData_Hip data;\n";
1333   code << tab << "data.t_id_x = threadIdx.x;\n";
1334   code << tab << "data.t_id_y = threadIdx.y;\n";
1335   code << tab << "data.t_id_z = threadIdx.z;\n";
1336   code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
1337   code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
1338 
1339   // -- Determine input mat reuse
1340   FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
1341 
1342   for (CeedInt i = 0; i < num_input_fields; i++) {
1343     input_matrix_reuse[i].index = -1;
1344   }
1345   for (CeedInt i = 0; i < num_input_fields; i++) {
1346     bool         is_tensor = true;
1347     CeedEvalMode eval_mode_i;
1348     CeedBasis    basis_i;
1349 
1350     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
1351     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
1352     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
1353     CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
1354     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
1355       CeedEvalMode eval_mode_j;
1356       CeedBasis    basis_j;
1357 
1358       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1359       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1360       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1361       if (basis_i == basis_j) {
1362         if (is_tensor) {
1363           input_matrix_reuse[i].index     = j;
1364           input_matrix_reuse[i].is_input  = true;
1365           input_matrix_reuse[i].eval_mode = eval_mode_j;
1366         } else {
1367           // For non-tensor can only re-use with the same eval mode
1368           if (eval_mode_i == eval_mode_j) {
1369             input_matrix_reuse[i].index     = j;
1370             input_matrix_reuse[i].is_input  = true;
1371             input_matrix_reuse[i].eval_mode = eval_mode_j;
1372           }
1373         }
1374       }
1375       CeedCallBackend(CeedBasisDestroy(&basis_j));
1376     }
1377     CeedCallBackend(CeedBasisDestroy(&basis_i));
1378   }
1379 
1380   // -- Determine output mat reuse
1381   FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
1382 
1383   for (CeedInt i = 0; i < num_output_fields; i++) {
1384     output_matrix_reuse[i].index = -1;
1385   }
1386   for (CeedInt i = 0; i < num_output_fields; i++) {
1387     bool         is_tensor = true;
1388     CeedEvalMode eval_mode_i;
1389     CeedBasis    basis_i;
1390 
1391     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
1392     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
1393     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
1394       CeedEvalMode eval_mode_j;
1395       CeedBasis    basis_j;
1396 
1397       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1398       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1399       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1400       if (basis_i == basis_j) {
1401         if (is_tensor) {
1402           output_matrix_reuse[i].index     = j;
1403           output_matrix_reuse[i].is_input  = true;
1404           output_matrix_reuse[i].eval_mode = eval_mode_j;
1405         } else {
1406           // For non-tensor can only re-use with the same eval mode
1407           if (eval_mode_i == eval_mode_j) {
1408             output_matrix_reuse[i].index     = j;
1409             output_matrix_reuse[i].is_input  = true;
1410             output_matrix_reuse[i].eval_mode = eval_mode_j;
1411           }
1412         }
1413       }
1414       CeedCallBackend(CeedBasisDestroy(&basis_j));
1415     }
1416     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
1417       CeedEvalMode eval_mode_j;
1418       CeedBasis    basis_j;
1419 
1420       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
1421       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1422       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
1423       CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
1424       if (basis_i == basis_j) {
1425         if (is_tensor) {
1426           output_matrix_reuse[i].index     = j;
1427           output_matrix_reuse[i].is_input  = false;
1428           output_matrix_reuse[i].eval_mode = eval_mode_j;
1429         } else {
1430           // For non-tensor can only re-use with the same eval mode
1431           if (eval_mode_i == eval_mode_j) {
1432             output_matrix_reuse[i].index     = j;
1433             output_matrix_reuse[i].is_input  = false;
1434             output_matrix_reuse[i].eval_mode = eval_mode_j;
1435           }
1436         }
1437       }
1438       CeedCallBackend(CeedBasisDestroy(&basis_j));
1439     }
1440     CeedCallBackend(CeedBasisDestroy(&basis_i));
1441   }
1442 
1443   // Initialize constants, and matrices B and G
1444   code << "\n" << tab << "// Input field constants and basis data\n";
1445   for (CeedInt i = 0; i < num_input_fields; i++) {
1446     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
1447                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1448   }
1449   code << "\n" << tab << "// Output field constants and basis data\n";
1450   for (CeedInt i = 0; i < num_output_fields; i++) {
1451     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
1452                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
1453   }
1454 
1455   // Loop over all elements
1456   code << "\n" << tab << "// Element loop\n";
1457   code << tab << "__syncthreads();\n";
1458   code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
1459   tab.push();
1460 
1461   // -- Compute minimum buffer space needed
1462   CeedInt max_rstr_buffer_size = 1;
1463 
1464   for (CeedInt i = 0; i < num_input_fields; i++) {
1465     CeedEvalMode eval_mode;
1466 
1467     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1468     if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
1469       CeedInt             num_comp;
1470       CeedElemRestriction elem_rstr;
1471 
1472       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
1473       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1474       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1475       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1476     }
1477   }
1478   for (CeedInt i = 0; i < num_output_fields; i++) {
1479     CeedEvalMode eval_mode;
1480 
1481     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
1482     if (eval_mode != CEED_EVAL_NONE) {
1483       CeedInt             num_comp;
1484       CeedElemRestriction elem_rstr;
1485 
1486       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
1487       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1488       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1489       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1490     }
1491   }
1492   code << tab << "// Scratch restriction buffer space\n";
1493   code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
1494 
1495   // -- Determine best input field processing order
1496   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
1497 
1498   for (CeedInt i = 0; i < num_input_fields; i++) {
1499     field_rstr_in_buffer[i] = -1;
1500     input_field_order[i]    = -1;
1501   }
1502   {
1503     bool    is_ordered[CEED_FIELD_MAX];
1504     CeedInt curr_index = 0;
1505 
1506     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
1507     for (CeedInt i = 0; i < num_input_fields; i++) {
1508       CeedVector          vec_i;
1509       CeedElemRestriction rstr_i;
1510 
1511       if (is_ordered[i]) continue;
1512       field_rstr_in_buffer[i]       = i;
1513       is_ordered[i]                 = true;
1514       input_field_order[curr_index] = i;
1515       curr_index++;
1516       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
1517       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
1518       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
1519       for (CeedInt j = i + 1; j < num_input_fields; j++) {
1520         CeedVector          vec_j;
1521         CeedElemRestriction rstr_j;
1522 
1523         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
1524         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
1525         if (rstr_i == rstr_j && vec_i == vec_j) {
1526           field_rstr_in_buffer[j]       = i;
1527           is_ordered[j]                 = true;
1528           input_field_order[curr_index] = j;
1529           curr_index++;
1530         }
1531         CeedCallBackend(CeedVectorDestroy(&vec_j));
1532         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
1533       }
1534       CeedCallBackend(CeedVectorDestroy(&vec_i));
1535       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
1536     }
1537   }
1538 
1539   // -- Input restriction and basis
1540   code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
1541   for (CeedInt i = 0; i < num_input_fields; i++) {
1542     const char   *field_name;
1543     const CeedInt f = input_field_order[i];
1544 
1545     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
1546     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
1547 
1548     // ---- Restriction
1549     CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
1550                                                                max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1551 
1552     // ---- Basis action
1553     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
1554                                                          is_all_tensor, is_at_points, use_3d_slices));
1555   }
1556 
1557   // -- Q function
1558   CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
1559                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
1560                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices));
1561 
1562   // -- Output basis and restriction
1563   code << "\n" << tab << "// -- Output field basis action and restrictions\n";
1564   for (CeedInt i = 0; i < num_output_fields; i++) {
1565     const char *field_name;
1566 
1567     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
1568     code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
1569 
1570     // ---- Basis action
1571     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
1572                                                          is_all_tensor, is_at_points, use_3d_slices));
1573 
1574     // ---- Restriction
1575     CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, i, NULL, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d,
1576                                                                false, is_all_tensor, is_at_points, use_3d_slices));
1577   }
1578 
1579   // Close loop and function
1580   tab.pop();
1581   code << tab << "}\n";
1582   tab.pop();
1583   code << tab << "}\n";
1584   code << tab << "// -----------------------------------------------------------------------------\n\n";
1585 
1586   CeedInt block_sizes[3] = {0, 0, 0};
1587   CeedInt num_elem;
1588 
1589   // Compile
1590   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
1591   CeedCallBackend(BlockGridCalculate_Hip_gen(is_all_tensor ? max_dim : 1, num_elem, data->max_P_1d, is_all_tensor ? Q_1d : Q, block_sizes));
1592   {
1593     bool is_compile_good = false;
1594 
1595     data->thread_1d = block_sizes[0];
1596     CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "OP_T_1D", block_sizes[0], "BLOCK_SIZE",
1597                                        block_sizes[0] * block_sizes[1] * block_sizes[2]));
1598     if (is_compile_good) {
1599       *is_good_build = true;
1600       CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, operator_name.c_str(), &data->op));
1601     } else {
1602       *is_good_build     = false;
1603       data->use_fallback = true;
1604     }
1605   }
1606   CeedCallBackend(CeedOperatorSetSetupDone(op));
1607   CeedCallBackend(CeedDestroy(&ceed));
1608   CeedCallBackend(CeedQFunctionDestroy(&qf));
1609   return CEED_ERROR_SUCCESS;
1610 }
1611 
1612 //------------------------------------------------------------------------------
1613 // Build AtPoints assembly operator kernel
1614 //------------------------------------------------------------------------------
1615 static int CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op, bool is_full, bool *is_good_build) {
1616   bool                   is_all_tensor = true, is_at_points = false, use_3d_slices = false;
1617   Ceed                   ceed;
1618   CeedInt                Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
1619   CeedQFunctionField    *qf_input_fields, *qf_output_fields;
1620   CeedQFunction_Hip_gen *qf_data;
1621   CeedQFunction          qf;
1622   CeedOperatorField     *op_input_fields, *op_output_fields;
1623   CeedOperator_Hip_gen  *data;
1624   std::ostringstream     code;
1625   Tab                    tab;
1626 
1627   // Check compatibility
1628   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1629   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
1630   CeedCheck(is_at_points, ceed, CEED_ERROR_BACKEND, "Only AtPoints operator assembly supported");
1631 
1632   // Retrieve operator data
1633   CeedCallBackend(CeedOperatorGetData(op, &data));
1634   Q       = data->Q;
1635   Q_1d    = data->Q_1d;
1636   max_dim = data->dim;
1637   {
1638     CeedElemRestriction rstr_points = NULL;
1639 
1640     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
1641     CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
1642     CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
1643     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
1644   }
1645   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
1646   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
1647   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
1648   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
1649 
1650   // Load basis source files
1651   code << tab << "// Tensor basis source\n";
1652   code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
1653   code << tab << "// AtPoints basis source\n";
1654   code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n";
1655   code << tab << "// CodeGen operator source\n";
1656   code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
1657 
1658   // Get QFunction name
1659   std::string qfunction_name(qf_data->qfunction_name);
1660   std::string operator_name;
1661 
1662   if (is_full) {
1663     operator_name = "CeedKernelHipGenOperatorFullAssembly_" + qfunction_name;
1664   } else {
1665     operator_name = "CeedKernelHipGenOperatorDiagonalAssembly_" + qfunction_name;
1666   }
1667 
1668   // Define CEED_Q_VLA
1669   code << "\n" << tab << "#undef CEED_Q_VLA\n";
1670   code << tab << "#define CEED_Q_VLA 1\n\n";
1671 
1672   // Add user QFunction source
1673   {
1674     const char *source_path;
1675 
1676     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
1677     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
1678 
1679     code << tab << "// User QFunction source\n";
1680     code << tab << "#include \"" << source_path << "\"\n\n";
1681   }
1682 
1683   // Setup
1684   code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
1685   code << tab << "// Operator Assembly Kernel\n";
1686   code << tab << "// \n";
1687   code << tab << "// d_[in,out]_i:   CeedVector device array\n";
1688   code << tab << "// r_[in,out]_e_i: Element vector register\n";
1689   code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
1690   code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
1691   code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
1692   code << tab << "// \n";
1693   code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
1694   code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
1695   code << tab << "// -----------------------------------------------------------------------------\n";
1696   code << tab << "extern \"C\" __global__ void " << operator_name
1697        << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar *W, Points_Hip "
1698           "points, CeedScalar *__restrict__ values_array) {\n";
1699   tab.push();
1700 
1701   // Scratch buffers
1702   for (CeedInt i = 0; i < num_input_fields; i++) {
1703     CeedEvalMode eval_mode;
1704 
1705     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1706     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
1707       code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
1708     }
1709   }
1710   for (CeedInt i = 0; i < num_output_fields; i++) {
1711     code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
1712   }
1713 
1714   code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
1715   code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
1716   code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n";
1717   code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
1718 
1719   // Shared data
1720   code << tab << "extern __shared__ CeedScalar slice[];\n";
1721   code << tab << "SharedData_Hip data;\n";
1722   code << tab << "data.t_id_x = threadIdx.x;\n";
1723   code << tab << "data.t_id_y = threadIdx.y;\n";
1724   code << tab << "data.t_id_z = threadIdx.z;\n";
1725   code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
1726   code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
1727 
1728   // -- Determine input mat reuse
1729   FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
1730 
1731   for (CeedInt i = 0; i < num_input_fields; i++) {
1732     input_matrix_reuse[i].index = -1;
1733   }
1734   for (CeedInt i = 0; i < num_input_fields; i++) {
1735     CeedEvalMode eval_mode_i;
1736     CeedBasis    basis_i;
1737 
1738     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
1739     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
1740     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
1741     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
1742       CeedEvalMode eval_mode_j;
1743       CeedBasis    basis_j;
1744 
1745       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1746       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1747       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1748       if (basis_i == basis_j) {
1749         input_matrix_reuse[i].index     = j;
1750         input_matrix_reuse[i].is_input  = true;
1751         input_matrix_reuse[i].eval_mode = eval_mode_j;
1752       }
1753       CeedCallBackend(CeedBasisDestroy(&basis_j));
1754     }
1755     CeedCallBackend(CeedBasisDestroy(&basis_i));
1756   }
1757 
1758   // -- Determine output mat reuse
1759   FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
1760 
1761   for (CeedInt i = 0; i < num_output_fields; i++) {
1762     output_matrix_reuse[i].index = -1;
1763   }
1764   for (CeedInt i = 0; i < num_output_fields; i++) {
1765     CeedEvalMode eval_mode_i;
1766     CeedBasis    basis_i;
1767 
1768     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
1769     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
1770     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
1771       CeedEvalMode eval_mode_j;
1772       CeedBasis    basis_j;
1773 
1774       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1775       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1776       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1777       if (basis_i == basis_j) {
1778         output_matrix_reuse[i].index     = j;
1779         output_matrix_reuse[i].is_input  = true;
1780         output_matrix_reuse[i].eval_mode = eval_mode_j;
1781       }
1782       CeedCallBackend(CeedBasisDestroy(&basis_j));
1783     }
1784     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
1785       CeedEvalMode eval_mode_j;
1786       CeedBasis    basis_j;
1787 
1788       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
1789       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1790       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
1791       if (basis_i == basis_j) {
1792         output_matrix_reuse[i].index     = j;
1793         output_matrix_reuse[i].is_input  = false;
1794         output_matrix_reuse[i].eval_mode = eval_mode_j;
1795       }
1796       CeedCallBackend(CeedBasisDestroy(&basis_j));
1797     }
1798     CeedCallBackend(CeedBasisDestroy(&basis_i));
1799   }
1800 
1801   // Initialize constants, and matrices B and G
1802   code << "\n" << tab << "// Input field constants and basis data\n";
1803   for (CeedInt i = 0; i < num_input_fields; i++) {
1804     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
1805                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1806   }
1807   code << "\n" << tab << "// Output field constants and basis data\n";
1808   for (CeedInt i = 0; i < num_output_fields; i++) {
1809     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
1810                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
1811   }
1812 
1813   // Loop over all elements
1814   code << "\n" << tab << "// Element loop\n";
1815   code << tab << "__syncthreads();\n";
1816   code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
1817   tab.push();
1818 
1819   // -- Compute minimum buffer space needed
1820   CeedInt max_rstr_buffer_size = 1;
1821 
1822   for (CeedInt i = 0; i < num_input_fields; i++) {
1823     CeedEvalMode eval_mode;
1824 
1825     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1826     if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
1827       CeedInt             num_comp;
1828       CeedElemRestriction elem_rstr;
1829 
1830       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
1831       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1832       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1833       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1834     }
1835   }
1836   for (CeedInt i = 0; i < num_output_fields; i++) {
1837     CeedEvalMode eval_mode;
1838 
1839     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
1840     if (eval_mode != CEED_EVAL_NONE) {
1841       CeedInt             num_comp;
1842       CeedElemRestriction elem_rstr;
1843 
1844       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
1845       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1846       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1847       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1848     }
1849   }
1850   code << tab << "// Scratch restriction buffer space\n";
1851   code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
1852 
1853   // -- Determine best input field processing order
1854   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
1855 
1856   for (CeedInt i = 0; i < num_input_fields; i++) {
1857     field_rstr_in_buffer[i] = -1;
1858     input_field_order[i]    = -1;
1859   }
1860   {
1861     bool    is_ordered[CEED_FIELD_MAX];
1862     CeedInt curr_index = 0;
1863 
1864     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
1865     for (CeedInt i = 0; i < num_input_fields; i++) {
1866       CeedVector          vec_i;
1867       CeedElemRestriction rstr_i;
1868 
1869       if (is_ordered[i]) continue;
1870       field_rstr_in_buffer[i]       = i;
1871       is_ordered[i]                 = true;
1872       input_field_order[curr_index] = i;
1873       curr_index++;
1874       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
1875       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
1876       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
1877       for (CeedInt j = i + 1; j < num_input_fields; j++) {
1878         CeedVector          vec_j;
1879         CeedElemRestriction rstr_j;
1880 
1881         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
1882         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
1883         if (rstr_i == rstr_j && vec_i == vec_j) {
1884           field_rstr_in_buffer[j]       = i;
1885           is_ordered[j]                 = true;
1886           input_field_order[curr_index] = j;
1887           curr_index++;
1888         }
1889         CeedCallBackend(CeedVectorDestroy(&vec_j));
1890         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
1891       }
1892       CeedCallBackend(CeedVectorDestroy(&vec_i));
1893       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
1894     }
1895   }
1896 
1897   // -- Input restriction and basis
1898   code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
1899   CeedInt active_field_index = -1;
1900 
1901   for (CeedInt i = 0; i < num_input_fields; i++) {
1902     bool          is_active = false;
1903     const char   *field_name;
1904     const CeedInt f = input_field_order[i];
1905 
1906     {
1907       CeedVector vec;
1908 
1909       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
1910       is_active = vec == CEED_VECTOR_ACTIVE;
1911       CeedCallBackend(CeedVectorDestroy(&vec));
1912     }
1913 
1914     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
1915     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
1916 
1917     if (is_active) {
1918       std::string var_suffix = "_in_" + std::to_string(f);
1919 
1920       code << tab << "// Active field - no restriction or basis action here\n";
1921       if (active_field_index == -1) {
1922         active_field_index = f;
1923         code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? "P_1d" + var_suffix : "1")
1924              << "] = {0.0};\n";
1925       } else {
1926         code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_in_" << active_field_index << ";\n";
1927       }
1928     } else {
1929       // ---- Restriction
1930       CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
1931                                                                  max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1932 
1933       // ---- Basis action
1934       CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
1935                                                            is_all_tensor, is_at_points, use_3d_slices));
1936     }
1937   }
1938 
1939   // -- Loop over active field
1940   std::string active_var_suffix = "_in_" + std::to_string(active_field_index);
1941 
1942   code << "\n" << tab << "// Loop over nodes in active field\n";
1943   code << tab << "for (CeedInt n = 0; n < num_comp" << active_var_suffix << "*P_1d" << active_var_suffix
1944        << (max_dim > 1 ? "*P_1d" + active_var_suffix : "") << (max_dim > 2 ? "*P_1d" + active_var_suffix : "") << "; n++) {\n";
1945   tab.push();
1946 
1947   // -- Set current active node and component to 1
1948   code << tab << "// Set current active node and component to 1.0\n";
1949   code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 1.0, r_e"
1950        << active_var_suffix << ");\n\n";
1951 
1952   for (CeedInt i = 0; i < num_input_fields; i++) {
1953     bool          is_active = false;
1954     const char   *field_name;
1955     const CeedInt f = input_field_order[i];
1956 
1957     {
1958       CeedVector vec;
1959 
1960       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
1961       is_active = vec == CEED_VECTOR_ACTIVE;
1962       CeedCallBackend(CeedVectorDestroy(&vec));
1963     }
1964     if (!is_active) continue;
1965 
1966     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
1967     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
1968 
1969     // ---- Basis action
1970     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
1971                                                          is_all_tensor, is_at_points, use_3d_slices));
1972   }
1973 
1974   // -- Q function
1975   CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
1976                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
1977                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices));
1978 
1979   // -- Output basis and restriction
1980   code << "\n" << tab << "// -- Output field basis action and restrictions\n";
1981   for (CeedInt i = 0; i < num_output_fields; i++) {
1982     bool        is_active = false;
1983     const char *field_name;
1984 
1985     {
1986       CeedVector vec;
1987 
1988       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
1989       is_active = vec == CEED_VECTOR_ACTIVE;
1990       CeedCallBackend(CeedVectorDestroy(&vec));
1991     }
1992     if (!is_active) continue;
1993 
1994     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
1995     code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
1996 
1997     // ---- Basis action
1998     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
1999                                                          is_all_tensor, is_at_points, use_3d_slices));
2000 
2001     // ---- Restriction
2002     if (is_full) {
2003       std::string         var_suffix = "_out_" + std::to_string(i);
2004       CeedInt             comp_stride;
2005       CeedSize            l_size;
2006       CeedElemRestriction elem_rstr;
2007 
2008       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
2009       CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
2010       code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
2011       CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
2012       code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
2013       code << tab << "WriteLVecStandard" << max_dim << "d_Assembly<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
2014            << ">(data, l_size" << var_suffix << ", elem, n, r_e" << var_suffix << ", values_array);\n";
2015       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
2016     } else {
2017       std::string         var_suffix = "_out_" + std::to_string(i);
2018       CeedInt             comp_stride;
2019       CeedSize            l_size;
2020       CeedElemRestriction elem_rstr;
2021 
2022       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
2023       CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
2024       code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
2025       CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
2026       code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
2027       code << tab << "WriteLVecStandard" << max_dim << "d_Single<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
2028            << ">(data, l_size" << var_suffix << ", elem, n, indices.outputs[" << i << "], r_e" << var_suffix << ", values_array);\n";
2029       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
2030     }
2031   }
2032 
2033   // -- Reset current active node and component
2034   code << "\n" << tab << "// Reset current active node and component to 0.0\n";
2035   code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 0.0, r_e"
2036        << active_var_suffix << ");\n";
2037 
2038   // -- End of loop over active field
2039   tab.pop();
2040   code << tab << "}\n";
2041 
2042   // Close loop and function
2043   tab.pop();
2044   code << tab << "}\n";
2045   tab.pop();
2046   code << tab << "}\n";
2047   code << tab << "// -----------------------------------------------------------------------------\n\n";
2048 
2049   CeedInt block_sizes[3] = {0, 0, 0};
2050   CeedInt num_elem;
2051 
2052   // Compile
2053   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
2054   CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes));
2055   {
2056     bool is_compile_good = false;
2057 
2058     data->thread_1d = block_sizes[0];
2059     CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good,
2060                                        is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 2, "OP_T_1D", block_sizes[0],
2061                                        "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]));
2062     if (is_compile_good) {
2063       *is_good_build = true;
2064       CeedCallBackend(CeedGetKernel_Hip(ceed, is_full ? data->module_assemble_full : data->module_assemble_diagonal, operator_name.c_str(),
2065                                         is_full ? &data->assemble_full : &data->assemble_diagonal));
2066     } else {
2067       *is_good_build              = false;
2068       data->use_assembly_fallback = true;
2069     }
2070   }
2071   CeedCallBackend(CeedDestroy(&ceed));
2072   CeedCallBackend(CeedQFunctionDestroy(&qf));
2073   return CEED_ERROR_SUCCESS;
2074 }
2075 
2076 extern "C" int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) {
2077   return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, false, is_good_build);
2078 }
2079 
2080 extern "C" int CeedOperatorBuildKernelFullAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) {
2081   return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, true, is_good_build);
2082 }
2083 //------------------------------------------------------------------------------
2084 // Build QFunction assembly operator kernel
2085 //------------------------------------------------------------------------------
2086 extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperator op, bool *is_good_build) {
2087   bool                   is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
2088   Ceed                   ceed;
2089   CeedInt                Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0;
2090   CeedQFunctionField    *qf_input_fields, *qf_output_fields;
2091   CeedQFunction_Hip_gen *qf_data;
2092   CeedQFunction          qf;
2093   CeedOperatorField     *op_input_fields, *op_output_fields;
2094   CeedOperator_Hip_gen  *data;
2095   std::ostringstream     code;
2096   Tab                    tab;
2097 
2098   // Check compatibility
2099   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
2100   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
2101   CeedCheck(!is_at_points, ceed, CEED_ERROR_BACKEND, "AtPoints QFunction assembly is not supported");
2102 
2103   // Check field compatibility
2104   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
2105   {
2106     bool has_shared_bases = true;
2107 
2108     for (CeedInt i = 0; i < num_input_fields; i++) {
2109       CeedBasis basis;
2110 
2111       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
2112       if (basis != CEED_BASIS_NONE) {
2113         bool        is_tensor = true;
2114         const char *resource;
2115         char       *resource_root;
2116         Ceed        basis_ceed;
2117 
2118         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
2119         is_all_tensor    = is_all_tensor && is_tensor;
2120         is_all_nontensor = is_all_nontensor && !is_tensor;
2121         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
2122         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
2123         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
2124         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
2125         CeedCallBackend(CeedFree(&resource_root));
2126         CeedCallBackend(CeedDestroy(&basis_ceed));
2127       }
2128       CeedCallBackend(CeedBasisDestroy(&basis));
2129     }
2130 
2131     for (CeedInt i = 0; i < num_output_fields; i++) {
2132       CeedBasis basis;
2133 
2134       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
2135       if (basis != CEED_BASIS_NONE) {
2136         bool        is_tensor = true;
2137         const char *resource;
2138         char       *resource_root;
2139         Ceed        basis_ceed;
2140 
2141         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
2142         is_all_tensor    = is_all_tensor && is_tensor;
2143         is_all_nontensor = is_all_nontensor && !is_tensor;
2144 
2145         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
2146         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
2147         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
2148         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
2149         CeedCallBackend(CeedFree(&resource_root));
2150         CeedCallBackend(CeedDestroy(&basis_ceed));
2151       }
2152       CeedCallBackend(CeedBasisDestroy(&basis));
2153     }
2154   }
2155 
2156   // Retrieve operator data
2157   CeedCallBackend(CeedOperatorGetData(op, &data));
2158   Q       = data->Q;
2159   Q_1d    = data->Q_1d;
2160   max_dim = data->dim;
2161   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
2162   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
2163   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
2164 
2165   // Load basis source files
2166   if (!is_all_nontensor) {
2167     code << tab << "// Tensor basis source\n";
2168     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
2169   }
2170   if (!is_all_tensor) {
2171     code << tab << "// Non-tensor basis source\n";
2172     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n";
2173   }
2174   if (!is_all_tensor && !is_all_nontensor) {
2175     code << "// Tensor basis source\n";
2176     code << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h>\n\n";
2177   }
2178   code << "// CodeGen operator source\n";
2179   code << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
2180 
2181   // Get QFunction name
2182   std::string qfunction_name(qf_data->qfunction_name);
2183   std::string operator_name;
2184 
2185   operator_name = "CeedKernelHipGenQFunctionAssembly_" + qfunction_name;
2186 
2187   // Define CEED_Q_VLA
2188   code << "\n" << tab << "#undef CEED_Q_VLA\n";
2189   if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
2190     code << tab << "#define CEED_Q_VLA 1\n\n";
2191   } else {
2192     code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n";
2193   }
2194 
2195   // Add user QFunction source
2196   {
2197     const char *source_path;
2198 
2199     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
2200     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
2201 
2202     code << tab << "// User QFunction source\n";
2203     code << tab << "#include \"" << source_path << "\"\n\n";
2204   }
2205 
2206   // Setup
2207   code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
2208   code << tab << "// Operator Assembly Kernel\n";
2209   code << tab << "// \n";
2210   code << tab << "// d_[in,out]_i:   CeedVector device array\n";
2211   code << tab << "// r_[in,out]_e_i: Element vector register\n";
2212   code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
2213   code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
2214   code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
2215   code << tab << "// \n";
2216   code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
2217   code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
2218   code << tab << "// -----------------------------------------------------------------------------\n";
2219   code << tab << "extern \"C\" __global__ void " << operator_name
2220        << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar *W, Points_Hip "
2221           "points, CeedScalar *__restrict__ values_array) {\n";
2222   tab.push();
2223 
2224   // Scratch buffers
2225   for (CeedInt i = 0; i < num_input_fields; i++) {
2226     CeedEvalMode eval_mode;
2227 
2228     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
2229     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
2230       code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
2231     }
2232   }
2233   for (CeedInt i = 0; i < num_output_fields; i++) {
2234     code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
2235   }
2236 
2237   code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
2238   if (!is_all_tensor) {
2239     code << tab << "const CeedInt Q = " << Q << ";\n";
2240   }
2241   if (!is_all_nontensor) {
2242     code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
2243   }
2244 
2245   // Shared data
2246   code << tab << "extern __shared__ CeedScalar slice[];\n";
2247   code << tab << "SharedData_Hip data;\n";
2248   code << tab << "data.t_id_x = threadIdx.x;\n";
2249   code << tab << "data.t_id_y = threadIdx.y;\n";
2250   code << tab << "data.t_id_z = threadIdx.z;\n";
2251   code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
2252   code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
2253 
2254   // -- Determine input mat reuse
2255   FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
2256 
2257   for (CeedInt i = 0; i < num_input_fields; i++) {
2258     input_matrix_reuse[i].index = -1;
2259   }
2260   for (CeedInt i = 0; i < num_input_fields; i++) {
2261     bool         is_tensor = true;
2262     CeedEvalMode eval_mode_i;
2263     CeedBasis    basis_i;
2264 
2265     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
2266     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
2267     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
2268     CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
2269     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
2270       CeedEvalMode eval_mode_j;
2271       CeedBasis    basis_j;
2272 
2273       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
2274       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
2275       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
2276       if (basis_i == basis_j) {
2277         if (is_tensor) {
2278           input_matrix_reuse[i].index     = j;
2279           input_matrix_reuse[i].is_input  = true;
2280           input_matrix_reuse[i].eval_mode = eval_mode_j;
2281         } else {
2282           // For non-tensor can only re-use with the same eval mode
2283           if (eval_mode_i == eval_mode_j) {
2284             input_matrix_reuse[i].index     = j;
2285             input_matrix_reuse[i].is_input  = true;
2286             input_matrix_reuse[i].eval_mode = eval_mode_j;
2287           }
2288         }
2289       }
2290       CeedCallBackend(CeedBasisDestroy(&basis_j));
2291     }
2292     CeedCallBackend(CeedBasisDestroy(&basis_i));
2293   }
2294 
2295   // -- Determine output mat reuse
2296   FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
2297 
2298   for (CeedInt i = 0; i < num_output_fields; i++) {
2299     output_matrix_reuse[i].index = -1;
2300   }
2301   for (CeedInt i = 0; i < num_output_fields; i++) {
2302     bool         is_tensor = true;
2303     CeedEvalMode eval_mode_i;
2304     CeedBasis    basis_i;
2305 
2306     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
2307     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
2308     CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
2309     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
2310       CeedEvalMode eval_mode_j;
2311       CeedBasis    basis_j;
2312 
2313       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
2314       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
2315       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
2316       if (basis_i == basis_j) {
2317         if (is_tensor) {
2318           output_matrix_reuse[i].index     = j;
2319           output_matrix_reuse[i].is_input  = true;
2320           output_matrix_reuse[i].eval_mode = eval_mode_j;
2321         } else {
2322           // For non-tensor can only re-use with the same eval mode
2323           if (eval_mode_i == eval_mode_j) {
2324             output_matrix_reuse[i].index     = j;
2325             output_matrix_reuse[i].is_input  = true;
2326             output_matrix_reuse[i].eval_mode = eval_mode_j;
2327           }
2328         }
2329       }
2330       CeedCallBackend(CeedBasisDestroy(&basis_j));
2331     }
2332     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
2333       CeedEvalMode eval_mode_j;
2334       CeedBasis    basis_j;
2335 
2336       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
2337       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
2338       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
2339       if (basis_i == basis_j) {
2340         if (is_tensor) {
2341           output_matrix_reuse[i].index     = j;
2342           output_matrix_reuse[i].is_input  = false;
2343           output_matrix_reuse[i].eval_mode = eval_mode_j;
2344         } else {
2345           // For non-tensor can only re-use with the same eval mode
2346           if (eval_mode_i == eval_mode_j) {
2347             output_matrix_reuse[i].index     = j;
2348             output_matrix_reuse[i].is_input  = false;
2349             output_matrix_reuse[i].eval_mode = eval_mode_j;
2350           }
2351         }
2352       }
2353       CeedCallBackend(CeedBasisDestroy(&basis_j));
2354     }
2355     CeedCallBackend(CeedBasisDestroy(&basis_i));
2356   }
2357 
2358   // Initialize constants, and matrices B and G
2359   code << "\n" << tab << "// Input field constants and basis data\n";
2360   for (CeedInt i = 0; i < num_input_fields; i++) {
2361     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
2362                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
2363   }
2364   code << "\n" << tab << "// Output field constants and basis data\n";
2365   for (CeedInt i = 0; i < num_output_fields; i++) {
2366     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
2367                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
2368   }
2369 
2370   // Loop over all elements
2371   code << "\n" << tab << "// Element loop\n";
2372   code << tab << "__syncthreads();\n";
2373   code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
2374   tab.push();
2375 
2376   // -- Compute minimum buffer space needed
2377   CeedInt max_rstr_buffer_size = 1;
2378 
2379   for (CeedInt i = 0; i < num_input_fields; i++) {
2380     CeedEvalMode eval_mode;
2381 
2382     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
2383     if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
2384       CeedInt             num_comp;
2385       CeedElemRestriction elem_rstr;
2386 
2387       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
2388       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
2389       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
2390       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
2391     }
2392   }
2393   for (CeedInt i = 0; i < num_output_fields; i++) {
2394     CeedEvalMode eval_mode;
2395 
2396     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
2397     if (eval_mode != CEED_EVAL_NONE) {
2398       CeedInt             num_comp;
2399       CeedElemRestriction elem_rstr;
2400 
2401       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
2402       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
2403       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
2404       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
2405     }
2406   }
2407   code << tab << "// Scratch restriction buffer space\n";
2408   code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
2409 
2410   // -- Determine best input field processing order
2411   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
2412 
2413   for (CeedInt i = 0; i < num_input_fields; i++) {
2414     field_rstr_in_buffer[i] = -1;
2415     input_field_order[i]    = -1;
2416   }
2417   {
2418     bool    is_ordered[CEED_FIELD_MAX];
2419     CeedInt curr_index = 0;
2420 
2421     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
2422     for (CeedInt i = 0; i < num_input_fields; i++) {
2423       CeedVector          vec_i;
2424       CeedElemRestriction rstr_i;
2425 
2426       if (is_ordered[i]) continue;
2427       field_rstr_in_buffer[i]       = i;
2428       is_ordered[i]                 = true;
2429       input_field_order[curr_index] = i;
2430       curr_index++;
2431       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
2432       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
2433       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
2434       for (CeedInt j = i + 1; j < num_input_fields; j++) {
2435         CeedVector          vec_j;
2436         CeedElemRestriction rstr_j;
2437 
2438         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
2439         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
2440         if (rstr_i == rstr_j && vec_i == vec_j) {
2441           field_rstr_in_buffer[j]       = i;
2442           is_ordered[j]                 = true;
2443           input_field_order[curr_index] = j;
2444           curr_index++;
2445         }
2446         CeedCallBackend(CeedVectorDestroy(&vec_j));
2447         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
2448       }
2449       CeedCallBackend(CeedVectorDestroy(&vec_i));
2450       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
2451     }
2452   }
2453 
2454   // -- Input restriction and basis
2455   code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
2456   CeedInt num_active_in = 0, num_active_out = 0, qf_assembly_size_out = 0;
2457   CeedInt active_fields_in[CEED_FIELD_MAX], active_fields_out[CEED_FIELD_MAX];
2458 
2459   for (CeedInt i = 0; i < num_input_fields; i++) {
2460     bool          is_active = false;
2461     const char   *field_name;
2462     const CeedInt f = input_field_order[i];
2463 
2464     {
2465       CeedVector vec;
2466 
2467       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
2468       is_active = vec == CEED_VECTOR_ACTIVE;
2469       CeedCallBackend(CeedVectorDestroy(&vec));
2470     }
2471 
2472     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
2473     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
2474 
2475     if (is_active) {
2476       CeedEvalMode eval_mode;
2477       CeedInt      field_size;
2478 
2479       active_fields_in[num_active_in] = f;
2480       num_active_in++;
2481       CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[f], &field_size));
2482       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[f], &eval_mode));
2483       if (eval_mode == CEED_EVAL_GRAD) {
2484         code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << "dim_in_" << f << "*"
2485              << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
2486       } else {
2487         code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
2488       }
2489       code << tab << "const CeedInt field_size_in_" << f << " = " << field_size << ";\n";
2490     } else {
2491       // ---- Restriction
2492       CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
2493                                                                  max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
2494 
2495       // ---- Basis action
2496       CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
2497                                                            is_all_tensor, is_at_points, use_3d_slices));
2498     }
2499   }
2500   code << tab << "const CeedInt field_sizes_in[" << num_active_in << "] = {";
2501   for (CeedInt i = 0; i < num_active_in; i++) {
2502     code << "field_size_in_" << active_fields_in[i] << (i < num_active_in - 1 ? ", " : "");
2503   }
2504   code << "};\n";
2505   code << tab << "CeedScalar * r_q_in[" << num_active_in << "] = {";
2506   for (CeedInt i = 0; i < num_active_in; i++) {
2507     code << "r_q_in_" << active_fields_in[i] << (i < num_active_in - 1 ? ", " : "");
2508   }
2509   code << "};\n";
2510 
2511   for (CeedInt i = 0; i < num_output_fields; i++) {
2512     bool is_active = false;
2513 
2514     {
2515       CeedVector vec;
2516 
2517       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
2518       is_active = vec == CEED_VECTOR_ACTIVE;
2519       CeedCallBackend(CeedVectorDestroy(&vec));
2520     }
2521     if (is_active) {
2522       const char *field_name;
2523       CeedInt     field_size;
2524 
2525       active_fields_out[num_active_out] = i;
2526       num_active_out++;
2527       CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
2528       qf_assembly_size_out += field_size;
2529       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
2530       code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
2531       code << tab << "const CeedInt field_size_out_" << i << " = " << field_size << ";\n";
2532     }
2533   }
2534   code << tab << "const CeedInt field_sizes_out[" << num_active_out << "] = {";
2535   for (CeedInt i = 0; i < num_active_out; i++) {
2536     code << "field_size_out_" << active_fields_out[i] << (i < num_active_out - 1 ? ", " : "");
2537   }
2538   code << "};\n";
2539   code << tab << "const CeedInt total_size_out = " << qf_assembly_size_out << ";\n";
2540 
2541   // -- Loop over active field
2542   code << "\n" << tab << "CeedInt input_offset = 0;\n";
2543   code << tab << "// Loop over active QFunction input fields\n";
2544   code << tab << "const CeedInt num_active_in = " << num_active_in << ";\n";
2545   code << tab << "for (CeedInt a = 0; a < num_active_in; a++) {\n";
2546   tab.push();
2547 
2548   // -- Loop over size of active field
2549   code << "\n" << tab << "// Loop over current active input field size\n";
2550   code << tab << "const CeedInt field_size_in = field_sizes_in[a];\n";
2551   code << tab << "for (CeedInt s = 0; s < field_size_in; s++) {\n";
2552   tab.push();
2553 
2554   // -- Set current active point and component to 1
2555   code << tab << "// Set current active point and component to 1.0\n";
2556   if (is_all_tensor && (max_dim >= 3)) {
2557     code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 1.0;\n";
2558   } else {
2559     code << tab << "r_q_in[a][s] = 1.0;\n";
2560   }
2561 
2562   // -- Q function
2563   CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
2564                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
2565                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices));
2566 
2567   // -- Output basis and restriction
2568   code << "\n" << tab << "// -- Output field basis action and restrictions\n";
2569   CeedScalar offset = 0;
2570 
2571   for (CeedInt i = 0; i < num_output_fields; i++) {
2572     bool        is_active = false;
2573     const char *field_name;
2574 
2575     {
2576       CeedVector vec;
2577 
2578       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
2579       is_active = vec == CEED_VECTOR_ACTIVE;
2580       CeedCallBackend(CeedVectorDestroy(&vec));
2581     }
2582     if (!is_active) continue;
2583 
2584     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
2585     code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
2586 
2587     // ---- Restriction
2588     CeedInt field_size;
2589 
2590     code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d_QFAssembly<total_size_out, field_size_out_" << i << ", "
2591          << (is_all_tensor ? "Q_1d" : "Q") << ">(data, num_elem, elem, input_offset + s, " << offset << ", r_q_out_" << i << ", values_array);\n";
2592     CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
2593     offset += field_size;
2594   }
2595 
2596   // -- Reset current active node and component
2597   code << "\n" << tab << "// Reset current active node and component to 0.0\n";
2598   if (is_all_tensor && (max_dim >= 3)) {
2599     code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 0.0;\n";
2600   } else {
2601     code << tab << "r_q_in[a][s] = 0.0;\n";
2602   }
2603 
2604   // -- End of loop over size of active field
2605   tab.pop();
2606   code << tab << "}\n";
2607   code << tab << "input_offset += field_size_in;\n";
2608 
2609   // -- End of loop over active field
2610   tab.pop();
2611   code << tab << "}\n";
2612 
2613   // Close loop and function
2614   tab.pop();
2615   code << tab << "}\n";
2616   tab.pop();
2617   code << tab << "}\n";
2618   code << tab << "// -----------------------------------------------------------------------------\n\n";
2619 
2620   CeedInt block_sizes[3] = {0, 0, 0};
2621   CeedInt num_elem;
2622 
2623   // Compile
2624   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
2625   CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes));
2626   {
2627     bool is_compile_good = false;
2628 
2629     data->thread_1d = block_sizes[0];
2630     CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 2, "OP_T_1D", block_sizes[0],
2631                                        "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]));
2632     if (is_compile_good) {
2633       *is_good_build = true;
2634       CeedCallBackend(CeedGetKernel_Hip(ceed, data->module_assemble_qfunction, operator_name.c_str(), &data->assemble_qfunction));
2635     } else {
2636       *is_good_build              = false;
2637       data->use_assembly_fallback = true;
2638     }
2639   }
2640   CeedCallBackend(CeedDestroy(&ceed));
2641   CeedCallBackend(CeedQFunctionDestroy(&qf));
2642   return CEED_ERROR_SUCCESS;
2643 }
2644 
2645 //------------------------------------------------------------------------------
2646