xref: /libCEED/rust/libceed-sys/c-src/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision 0183ed61035d97ff853cf8c8e722c0fda76e54df)
1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #define CEED_DEBUG_COLOR 12
9 
10 #include <ceed.h>
11 #include <ceed/backend.h>
12 #include <ceed/gen-tools.h>
13 #include <ceed/jit-tools.h>
14 #include <cuda_runtime.h>
15 
16 #include <iostream>
17 #include <sstream>
18 #include <string>
19 
20 #include "../cuda-ref/ceed-cuda-ref.h"
21 #include "../cuda-shared/ceed-cuda-shared.h"
22 #include "../cuda/ceed-cuda-common.h"
23 #include "../cuda/ceed-cuda-compile.h"
24 #include "ceed-cuda-gen.h"
25 
// Describes whether a field can alias the shared-memory basis matrices
// (s_B/s_G) already loaded for an earlier field instead of loading its own.
struct FieldReuse_Cuda {
  CeedInt      index;      // index of the earlier field to reuse, or -1 for no reuse
  bool         is_input;   // true if the reused field is an input field, false if output
  CeedEvalMode eval_mode;  // eval mode of the reused field (G matrices only reusable for CEED_EVAL_GRAD)
};
31 
32 //------------------------------------------------------------------------------
33 // Determine type of operator
34 //------------------------------------------------------------------------------
35 static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
36                                                 CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields,
37                                                 CeedQFunctionField *qf_output_fields, CeedInt *max_P, CeedInt *max_P_1d, CeedInt *Q, CeedInt *Q_1d,
38                                                 CeedInt *max_dim, bool *is_all_tensor, bool *use_3d_slices) {
39   // Check if all are tensor
40   *is_all_tensor = true;
41   for (CeedInt i = 0; i < num_input_fields; i++) {
42     CeedBasis basis;
43 
44     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
45     if (basis != CEED_BASIS_NONE) {
46       bool is_field_tensor;
47 
48       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
49       *is_all_tensor = *is_all_tensor && is_field_tensor;
50     }
51     CeedCallBackend(CeedBasisDestroy(&basis));
52   }
53   for (CeedInt i = 0; i < num_output_fields; i++) {
54     CeedBasis basis;
55 
56     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
57     if (basis != CEED_BASIS_NONE) {
58       bool is_field_tensor;
59 
60       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
61       *is_all_tensor = *is_all_tensor && is_field_tensor;
62     }
63     CeedCallBackend(CeedBasisDestroy(&basis));
64   }
65 
66   // Find max_P, max_P_1d, Q, and Q_1d
67   bool is_all_3d = true;
68 
69   *max_P    = 0;
70   *max_P_1d = 0;
71   *Q        = 0;
72   *Q_1d     = 0;
73   for (CeedInt i = 0; i < num_input_fields; i++) {
74     CeedBasis basis;
75 
76     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
77     if (basis != CEED_BASIS_NONE) {
78       bool    is_field_tensor;
79       CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
80 
81       // Check if 3D
82       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
83       is_all_3d = is_all_3d && (field_dim == 3);
84       *max_dim  = CeedIntMax(*max_dim, field_dim);
85 
86       // Collect P, P_1d, Q, and Q_1d
87       CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
88       *max_P = CeedIntMax(*max_P, field_P);
89       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
90       if (is_field_tensor) {
91         CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
92         *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
93       }
94       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
95       CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
96       *Q = field_Q;
97       if (is_field_tensor) {
98         CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
99         CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
100         *Q_1d = field_Q_1d;
101       }
102     }
103     CeedCallBackend(CeedBasisDestroy(&basis));
104   }
105   for (CeedInt i = 0; i < num_output_fields; i++) {
106     CeedBasis basis;
107 
108     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
109     if (basis != CEED_BASIS_NONE) {
110       bool    is_field_tensor;
111       CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
112 
113       // Check if 3D
114       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
115       is_all_3d = is_all_3d && (field_dim == 3);
116       *max_dim  = CeedIntMax(*max_dim, field_dim);
117 
118       // Collect P, P_1d, Q, and Q_1d
119       CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
120       *max_P = CeedIntMax(*max_P, field_P);
121       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
122       if (is_field_tensor) {
123         CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
124         *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
125       }
126       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
127       CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
128       *Q = field_Q;
129       if (is_field_tensor) {
130         CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
131         CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
132         *Q_1d = field_Q_1d;
133       }
134     }
135     CeedCallBackend(CeedBasisDestroy(&basis));
136   }
137 
138   // Only use 3D collocated gradient parallelization strategy when gradient is computed
139   *use_3d_slices = false;
140   if (is_all_3d && *is_all_tensor) {
141     bool was_grad_found = false;
142 
143     for (CeedInt i = 0; i < num_input_fields; i++) {
144       CeedEvalMode eval_mode;
145 
146       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
147       if (eval_mode == CEED_EVAL_GRAD) {
148         CeedBasis_Cuda_shared *basis_data;
149         CeedBasis              basis;
150 
151         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
152         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
153         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
154         was_grad_found = true;
155         CeedCallBackend(CeedBasisDestroy(&basis));
156       }
157     }
158     for (CeedInt i = 0; i < num_output_fields; i++) {
159       CeedEvalMode eval_mode;
160 
161       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
162       if (eval_mode == CEED_EVAL_GRAD) {
163         CeedBasis_Cuda_shared *basis_data;
164         CeedBasis              basis;
165 
166         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
167         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
168         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
169         was_grad_found = true;
170         CeedCallBackend(CeedBasisDestroy(&basis));
171       }
172     }
173   }
174   return CEED_ERROR_SUCCESS;
175 }
176 
177 //------------------------------------------------------------------------------
178 // Setup fields
179 //------------------------------------------------------------------------------
// Emit the field-setup section of the generated operator kernel for one input
// or output field: write the per-field size constants (dim, P/P_1d, num_comp)
// into the kernel source, load the needed basis matrices into shared memory
// (s_B, plus s_G for gradients), and record the matching device pointers in
// data->B / data->G so the kernel launch can pass them in.
//
//   code        - generated kernel source being appended to
//   tab         - current indentation for emitted source lines
//   i           - field index within the inputs or outputs
//   field_reuse - earlier field whose shared-memory matrices may be aliased
//                 (field_reuse.index == -1 means no reuse)
//   is_input    - true when op_field/qf_field describe an input field
static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, Tab &tab, CeedInt i,
                                                     CeedOperatorField op_field, CeedQFunctionField qf_field, FieldReuse_Cuda field_reuse,
                                                     CeedInt max_dim, CeedInt Q, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
                                                     bool use_3d_slices) {
  bool      is_tensor = true;
  CeedBasis basis;
  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
  if (basis != CEED_BASIS_NONE) CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));

  const char            *field_name;
  std::string            var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
  std::string            P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
  std::string            option_name = (is_input ? "inputs" : "outputs");
  CeedEvalMode           eval_mode   = CEED_EVAL_NONE;
  CeedInt                elem_size = 0, num_comp = 0, dim = max_dim, P_1d = 0;
  CeedElemRestriction    elem_rstr;
  // NOTE(review): basis_data is only assigned below when basis != CEED_BASIS_NONE;
  // the INTERP/GRAD/WEIGHT paths that dereference it assume a basis is present
  CeedBasis_Cuda_shared *basis_data;

  // Field reuse info
  bool use_previous_field = field_reuse.index != -1;

  CeedCallBackend(CeedOperatorFieldGetName(op_field, &field_name));
  code << tab << "// -- " << (is_input ? "Input" : "Output") << " field " << i << ": " << field_name << "\n";

  // Get field data
  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
  }
  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
  if (basis != CEED_BASIS_NONE) {
    CeedCallBackend(CeedBasisGetData(basis, &basis_data));
    CeedCallBackend(CeedBasisGetDimension(basis, &dim));
    // P_1d doubles as total node count for non-tensor bases
    if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
    else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
  }
  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));

  // Set field constants
  code << tab << "const CeedInt dim" << var_suffix << " = " << dim << ";\n";
  if (is_tensor && !is_all_tensor) {
    // Mixed tensor/non-tensor operator: also emit total P for this tensor field
    CeedInt P = 0;

    CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
    code << tab << "const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q : P) << ";\n";
  }
  // With no basis the field lives at quadrature points, so P defaults to Q
  code << tab << "const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
  if (eval_mode != CEED_EVAL_WEIGHT) {
    code << tab << "const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
  }

  // Load basis data
  code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
  switch (eval_mode) {
    case CEED_EVAL_NONE:
      break;
    case CEED_EVAL_INTERP:
      if (is_at_points) {
        // AtPoints: lazily build and cache the device Chebyshev interpolation
        // matrix on the basis data the first time it is needed
        if (!basis_data->d_chebyshev_interp_1d) {
          CeedSize    interp_bytes;
          CeedScalar *chebyshev_interp_1d;

          interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
          CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
          CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
          CeedCallCuda(CeedBasisReturnCeed(basis), cudaMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
          CeedCallCuda(CeedBasisReturnCeed(basis),
                       cudaMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
          CeedCallBackend(CeedFree(&chebyshev_interp_1d));
        }
        if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
        else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
      } else {
        // Standard quadrature
        if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
        else data->B.outputs[i] = basis_data->d_interp_1d;
      }
      if (use_previous_field) {
        // Alias the shared-memory B matrix already loaded for an earlier field
        std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

        code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
      } else {
        code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
        code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
      }
      break;
    case CEED_EVAL_GRAD:
      if (is_at_points) {
        // AtPoints: same lazy Chebyshev interpolation cache as CEED_EVAL_INTERP
        if (!basis_data->d_chebyshev_interp_1d) {
          CeedSize    interp_bytes;
          CeedScalar *chebyshev_interp_1d;

          interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
          CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
          CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
          CeedCallCuda(CeedBasisReturnCeed(basis), cudaMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
          CeedCallCuda(CeedBasisReturnCeed(basis),
                       cudaMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
          CeedCallBackend(CeedFree(&chebyshev_interp_1d));
        }
        if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
        else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
      } else {
        // Standard quadrature
        if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
        else data->B.outputs[i] = basis_data->d_interp_1d;
      }
      if (is_tensor) {
        // Tensor gradient also needs the interpolation matrix s_B
        if (use_previous_field) {
          std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

          code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
        } else {
          code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
          code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
        }
      }
      if (is_at_points) break;  // No G mat for AtPoints
      if (use_3d_slices) {
        // Collocated gradient: G is a Q x Q matrix applied at quadrature points
        if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
        else data->G.outputs[i] = basis_data->d_collo_grad_1d;
        // G matrices are only reusable when the earlier field was also GRAD
        if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
          std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

          code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
        } else {
          code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
          code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
        }
      } else {
        // Prefer the collocated gradient matrix when the basis provides one
        bool has_collo_grad = basis_data->d_collo_grad_1d;

        if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
        else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
        if (has_collo_grad) {
          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
            std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

            code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
          } else {
            code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
            code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
          }
        } else {
          // Non-collocated: G is P x Q (times dim for non-tensor bases)
          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
            std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

            code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
          } else {
            code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim")
                 << (is_tensor ? "" : var_suffix) << "];\n";
            code << tab << "LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << (is_tensor ? "" : var_suffix) << ">(data, G."
                 << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
          }
        }
      }
      break;
    case CEED_EVAL_WEIGHT:
      break;  // No action
      // LCOV_EXCL_START
    case CEED_EVAL_DIV:
    case CEED_EVAL_CURL:
      break;  // TODO: Not implemented
              // LCOV_EXCL_STOP
  }
  CeedCallBackend(CeedBasisDestroy(&basis));
  return CEED_ERROR_SUCCESS;
}
351 
352 //------------------------------------------------------------------------------
353 // Restriction
354 //------------------------------------------------------------------------------
// Emit the element-restriction section of the generated operator kernel for
// one field: for inputs, read an element's values from the L-vector into the
// register buffer r_e_*; for outputs, write r_e_* back to the L-vector.
// Standard and points restrictions also record the device offset array in
// data->indices for the kernel launch.
//
//   field_input_buffer - for inputs, field_input_buffer[i] != i means a prior
//                        input already produced an identical r_e buffer to alias
static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, Tab &tab, CeedInt i,
                                                       CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
                                                       CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
                                                       bool use_3d_slices) {
  std::string               var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
  std::string               P_name     = (is_all_tensor ? "P_1d" : "P") + var_suffix;
  CeedEvalMode              eval_mode  = CEED_EVAL_NONE;
  CeedInt                   elem_size = 0, num_comp = 0;
  CeedSize                  l_size;
  CeedRestrictionType       rstr_type = CEED_RESTRICTION_STANDARD;
  // NOTE(review): rstr_data is only assigned when elem_rstr != CEED_ELEMRESTRICTION_NONE;
  // the standard/points branches below assume a restriction is present
  CeedElemRestriction_Cuda *rstr_data;
  CeedElemRestriction       elem_rstr;

  // Get field data
  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
    CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
    CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
  }
  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));

  // Restriction
  if (is_input) {
    // Input
    if (field_input_buffer[i] != i) {
      std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);

      // Restriction was already done for previous input
      code << tab << "CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
    } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) {
      if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) {
        // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated
        code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
      } else if (rstr_type != CEED_RESTRICTION_POINTS) {
        // Otherwise we're using the scratch space
        code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
      }
      switch (rstr_type) {
        case CEED_RESTRICTION_STANDARD: {
          CeedInt comp_stride;

          CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
          code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
          // Pass the offset array to the kernel launch
          data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
          code << tab << "ReadLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
               << P_name << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix
               << ");\n";
          break;
        }
        case CEED_RESTRICTION_STRIDED: {
          bool    has_backend_strides;
          CeedInt num_elem;

          CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
          CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
          // Default backend strides; overwritten below when user strides are set
          CeedInt strides[3] = {1, elem_size * num_elem, elem_size};

          if (!has_backend_strides) {
            CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
          }
          code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
               << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
          code << tab << "ReadLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
               << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, d" << var_suffix << ", r_e"
               << var_suffix << ");\n";
          break;
        }
        case CEED_RESTRICTION_POINTS: {
          // AtPoints: no element read here, only constants and offsets
          CeedInt comp_stride;

          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
          data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
          break;
        }
        // LCOV_EXCL_START
        case CEED_RESTRICTION_ORIENTED:
        case CEED_RESTRICTION_CURL_ORIENTED:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
  } else {
    // Output
    switch (rstr_type) {
      case CEED_RESTRICTION_STANDARD: {
        CeedInt comp_stride;

        CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
        code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
        CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
        code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
        data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
        code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
             << P_name << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix
             << ");\n";
        break;
      }
      case CEED_RESTRICTION_STRIDED: {
        bool    has_backend_strides;
        CeedInt num_elem;

        CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
        CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
        // Default backend strides; overwritten below when user strides are set
        CeedInt strides[3] = {1, elem_size * num_elem, elem_size};

        if (!has_backend_strides) {
          CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
        }
        code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
             << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
        code << tab << "WriteLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
             << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, r_e" << var_suffix << ", d" << var_suffix
             << ");\n";
        break;
      }
      case CEED_RESTRICTION_POINTS:
        // AtPoints: only the offset array is needed at this stage
        data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
        break;
      // LCOV_EXCL_START
      case CEED_RESTRICTION_ORIENTED:
      case CEED_RESTRICTION_CURL_ORIENTED:
        break;  // TODO: Not implemented
                // LCOV_EXCL_STOP
    }
  }
  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
  return CEED_ERROR_SUCCESS;
}
488 
489 //------------------------------------------------------------------------------
490 // Basis
491 //------------------------------------------------------------------------------
492 static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, Tab &tab, CeedInt i,
493                                                  CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d,
494                                                  bool is_input, bool is_all_tensor, bool is_at_points, bool use_3d_slices) {
495   bool      is_tensor = true;
496   CeedBasis basis;
497   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
498   CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
499 
500   std::string         var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
501   std::string         P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
502   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
503   CeedInt             dim = max_dim, elem_size = 0, num_comp = 0, P_1d = 0;
504   CeedElemRestriction elem_rstr;
505 
506   // Get field data
507   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
508   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
509     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
510     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
511   }
512   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
513   if (basis != CEED_BASIS_NONE) {
514     CeedCallBackend(CeedBasisGetDimension(basis, &dim));
515     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
516     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
517   }
518   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
519 
520   // Basis
521   code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
522   if (is_input) {
523     switch (eval_mode) {
524       case CEED_EVAL_NONE:
525         if (!use_3d_slices && !is_at_points) {
526           code << tab << "CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n";
527         }
528         break;
529       case CEED_EVAL_INTERP:
530         if (is_at_points) {
531           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
532 
533           code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
534           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
535                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
536         } else {
537           std::string function_name = is_tensor
538                                           ? ((dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
539                                           : "InterpNonTensor";
540           std::string op_t_1d_name  = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
541 
542           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
543           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
544                << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
545         }
546         break;
547       case CEED_EVAL_GRAD:
548         if (is_at_points) {
549           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
550 
551           code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
552           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
553                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
554         } else if (use_3d_slices) {
555           std::string function_name = (dim > 1 ? "InterpTensor" : "Interp") + std::to_string(dim) + "d";
556 
557           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
558           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
559                << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
560         } else if (is_tensor) {
561           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
562           std::string function_name = (dim == 1 ? "Grad" : (is_collocated ? "GradTensorCollocated" : "GradTensor")) + std::to_string(dim) + "d" +
563                                       (is_all_tensor ? "" : "Flattened");
564           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
565 
566           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
567                << (is_all_tensor && dim >= 3 ? Q_name : "1") << "];\n";
568           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
569                << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
570         } else {
571           std::string function_name = "GradNonTensor";
572 
573           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
574           code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
575                << ", OP_T_1D>(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
576         }
577         break;
578       case CEED_EVAL_WEIGHT: {
579         if (is_at_points) {
580           code << tab << "// Nothing to do AtPoints\n";
581         } else {
582           CeedBasis_Cuda_shared *basis_data;
583           std::string            function_name = is_tensor
584                                                      ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
585                                                      : "WeightNonTensor";
586 
587           code << tab << "CeedScalar r_q" << var_suffix << "[" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
588           CeedCallBackend(CeedBasisGetData(basis, &basis_data));
589           data->W = basis_data->d_q_weight_1d;
590           code << tab << function_name << "<" << P_name << ", " << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
591         }
592         break;
593       }
594       // LCOV_EXCL_START
595       case CEED_EVAL_DIV:
596       case CEED_EVAL_CURL:
597         break;  // TODO: Not implemented
598                 // LCOV_EXCL_STOP
599     }
600   } else {
601     switch (eval_mode) {
602       case CEED_EVAL_NONE:
603         code << tab << "CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
604         break;  // No action
605       case CEED_EVAL_INTERP:
606         code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
607         if (is_at_points) {
608           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
609 
610           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
611                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
612         } else {
613           std::string function_name =
614               is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
615                         : "InterpTransposeNonTensor";
616           std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
617 
618           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
619                << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
620         }
621         break;
622       case CEED_EVAL_GRAD:
623         code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
624         if (is_at_points) {
625           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
626 
627           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
628                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
629         } else if (use_3d_slices) {
630           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
631 
632           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
633                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
634         } else if (is_tensor) {
635           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
636           std::string function_name = (dim == 1 ? "GradTranspose" : (is_collocated ? "GradTransposeTensorCollocated" : "GradTransposeTensor")) +
637                                       std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
638           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
639 
640           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
641                << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
642         } else {
643           std::string function_name = "GradTransposeNonTensor";
644 
645           code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
646                << ", OP_T_1D>(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
647         }
648         break;
649       // LCOV_EXCL_START
650       case CEED_EVAL_WEIGHT:
651         break;  // Should not occur
652       case CEED_EVAL_DIV:
653       case CEED_EVAL_CURL:
654         break;  // TODO: Not implemented
655                 // LCOV_EXCL_STOP
656     }
657   }
658   CeedCallBackend(CeedBasisDestroy(&basis));
659   return CEED_ERROR_SUCCESS;
660 }
661 
662 //------------------------------------------------------------------------------
663 // QFunction
664 //------------------------------------------------------------------------------
665 static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, Tab &tab, CeedInt max_dim,
666                                                      CeedInt max_num_points, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
667                                                      CeedQFunctionField *qf_input_fields, CeedInt num_output_fields,
668                                                      CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields,
669                                                      std::string qfunction_name, CeedInt Q_1d, bool is_all_tensor, bool is_at_points,
670                                                      bool use_3d_slices) {
671   std::string         Q_name    = is_all_tensor ? "Q_1d" : "Q";
672   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
673   CeedElemRestriction elem_rstr;
674 
675   // Setup output arrays
676   code << "\n";
677   code << tab << "// -- Output field setup\n";
678   for (CeedInt i = 0; i < num_output_fields; i++) {
679     const char *field_name;
680     std::string var_suffix = "_out_" + std::to_string(i);
681 
682     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
683     code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
684     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
685     switch (eval_mode) {
686       case CEED_EVAL_NONE:
687         if (is_at_points) {
688           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n";
689         } else {
690           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
691                << "];\n";
692         }
693         break;
694       case CEED_EVAL_INTERP:
695         if (is_at_points) {
696           // Accumulator for point data
697           code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
698           code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
699                << "[i] = 0.0;\n";
700         } else {
701           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
702                << "];\n";
703         }
704         break;
705       case CEED_EVAL_GRAD:
706         if (is_at_points) {
707           // Accumulator for point data
708           code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
709           code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
710                << "[i] = 0.0;\n";
711         } else if (use_3d_slices) {
712           // Accumulator for gradient slices
713           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
714           code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) r_q" << var_suffix << "[i] = 0.0;\n";
715         } else {
716           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
717                << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") << "];\n";
718         }
719         break;
720       case CEED_EVAL_WEIGHT:
721         break;
722         // LCOV_EXCL_START
723       case CEED_EVAL_DIV:
724       case CEED_EVAL_CURL:
725         break;  // TODO: Not implemented
726                 // LCOV_EXCL_STOP
727     }
728   }
729 
730   if (is_at_points) {
731     // We need to handle batches of points
732     code << "\n";
733     code << tab << "// Note: Using batches of points\n";
734     code << tab << "const CeedInt point_loop_bound = (blockDim.x*blockDim.y) * ceil((1.0*max_num_points) / (blockDim.x*blockDim.y));\n\n";
735     code << tab << "#pragma unroll\n";
736     code << tab << "for (CeedInt i = threadIdx.x + threadIdx.y*blockDim.x; i < point_loop_bound; i += blockDim.x*blockDim.y) {\n";
737     tab.push();
738     code << tab << "const CeedInt p = i % max_num_points;\n\n";
739 
740     code << tab << "// -- Coordinates\n";
741     code << tab << "CeedScalar r_x[max_dim];\n";
742     code << tab << "ReadPoint<max_dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n";
743 
744     code << tab << "// -- Input fields\n";
745     for (CeedInt i = 0; i < num_input_fields; i++) {
746       const char *field_name;
747       std::string var_suffix = "_in_" + std::to_string(i);
748       std::string P_name     = "P_1d" + var_suffix;
749 
750       CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
751       code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
752       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
753       // Basis action
754       code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
755       switch (eval_mode) {
756         case CEED_EVAL_NONE:
757           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
758           code << tab << "ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
759                << ", max_num_points>(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
760           break;
761         case CEED_EVAL_INTERP:
762           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
763           code << tab << "InterpAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
764                << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
765           break;
766         case CEED_EVAL_GRAD:
767           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
768           code << tab << "GradAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
769                << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
770           break;
771         case CEED_EVAL_WEIGHT:
772           code << tab << "CeedScalar r_s" << var_suffix << "[1];\n";
773           code << tab << "r_s" << var_suffix << "[0] = 1.0;\n";
774           break;
775           // LCOV_EXCL_START
776         case CEED_EVAL_DIV:
777         case CEED_EVAL_CURL:
778           break;  // TODO: Not implemented
779                   // LCOV_EXCL_STOP
780       }
781     }
782     code << "\n";
783     code << tab << "// -- Output fields\n";
784     for (CeedInt i = 0; i < num_output_fields; i++) {
785       const char *field_name;
786       std::string var_suffix = "_out_" + std::to_string(i);
787 
788       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
789       code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
790       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
791       // Basis action
792       switch (eval_mode) {
793         case CEED_EVAL_NONE:
794           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
795           break;
796         case CEED_EVAL_INTERP:
797           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
798           break;
799         case CEED_EVAL_GRAD:
800           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
801           break;
802           // LCOV_EXCL_START
803         case CEED_EVAL_WEIGHT:
804           break;  // Should not occur
805         case CEED_EVAL_DIV:
806         case CEED_EVAL_CURL:
807           break;  // TODO: Not implemented
808                   // LCOV_EXCL_STOP
809       }
810     }
811 
812   } else if (use_3d_slices) {
813     // We treat quadrature points per slice in 3d to save registers
814     code << "\n";
815     code << tab << "// Note: Using planes of 3D elements\n";
816     code << tab << "#pragma unroll\n";
817     code << tab << "for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
818     tab.push();
819     code << tab << "// -- Input fields\n";
820     for (CeedInt i = 0; i < num_input_fields; i++) {
821       const char *field_name;
822       std::string var_suffix = "_in_" + std::to_string(i);
823 
824       CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
825       code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
826       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
827       // Basis action
828       code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
829       switch (eval_mode) {
830         case CEED_EVAL_NONE:
831           bool is_strided;
832 
833           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
834 
835           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
836           CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
837           if (is_strided) {
838             bool    has_backend_strides;
839             CeedInt num_elem, elem_size;
840 
841             CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
842             CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
843             CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
844             CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
845 
846             if (!has_backend_strides) {
847               CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
848             }
849             code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
850                  << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
851             code << tab << "ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", strides" << var_suffix << "_0, strides"
852                  << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n";
853           } else {
854             CeedSize                  l_size = 0;
855             CeedInt                   comp_stride;
856             CeedElemRestriction_Cuda *rstr_data;
857 
858             CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
859             code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
860             CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
861             code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
862             CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
863             data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
864             code << tab << "ReadEVecSliceStandard3d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", " << Q_name << ">(data, l_size"
865                  << var_suffix << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
866           }
867           CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
868           break;
869         case CEED_EVAL_INTERP:
870           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
871           code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n";
872           tab.push();
873           code << "r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n";
874           tab.pop();
875           code << tab << "}\n";
876           break;
877         case CEED_EVAL_GRAD:
878           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
879           code << tab << "GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_q" << var_suffix << ", s_G"
880                << var_suffix << ", r_s" << var_suffix << ");\n";
881           break;
882         case CEED_EVAL_WEIGHT:
883           code << tab << "CeedScalar r_s" << var_suffix << "[1];\n";
884           code << tab << "r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n";
885           break;
886           // LCOV_EXCL_START
887         case CEED_EVAL_DIV:
888         case CEED_EVAL_CURL:
889           break;  // TODO: Not implemented
890                   // LCOV_EXCL_STOP
891       }
892     }
893     code << "\n";
894     code << tab << "// -- Output fields\n";
895     for (CeedInt i = 0; i < num_output_fields; i++) {
896       const char *field_name;
897       std::string var_suffix = "_out_" + std::to_string(i);
898 
899       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
900       code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
901       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
902       // Basis action
903       switch (eval_mode) {
904         case CEED_EVAL_NONE:
905           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
906           break;
907         case CEED_EVAL_INTERP:
908           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
909           break;
910         case CEED_EVAL_GRAD:
911           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
912           break;
913           // LCOV_EXCL_START
914         case CEED_EVAL_WEIGHT:
915           break;  // Should not occur
916         case CEED_EVAL_DIV:
917         case CEED_EVAL_CURL:
918           break;  // TODO: Not implemented
919                   // LCOV_EXCL_STOP
920       }
921     }
922   } else {
923     code << "\n";
924     code << tab << "// Note: Using full elements\n";
925     code << tab << "{\n";
926     tab.push();
927     code << tab << "// -- Input fields\n";
928     for (CeedInt i = 0; i < num_input_fields; i++) {
929       const char *field_name;
930 
931       CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
932       code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
933       code << tab << "CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n";
934     }
935     code << tab << "// -- Output fields\n";
936     for (CeedInt i = 0; i < num_output_fields; i++) {
937       const char *field_name;
938 
939       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
940       code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
941       code << tab << "CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n";
942     }
943   }
944 
945   // Input and output buffers
946   code << "\n";
947   code << tab << "// -- QFunction inputs and outputs\n";
948   code << tab << "// ---- Inputs\n";
949   code << tab << "CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
950   for (CeedInt i = 0; i < num_input_fields; i++) {
951     const char *field_name;
952 
953     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
954     code << tab << "// ------ Input field " << i << ": " << field_name << "\n";
955     code << tab << "inputs[" << i << "] = r_s_in_" << i << ";\n";
956   }
957   code << tab << "// ---- Outputs\n";
958   code << tab << "CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
959   for (CeedInt i = 0; i < num_output_fields; i++) {
960     const char *field_name;
961 
962     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
963     code << tab << "// ------ Output field " << i << ": " << field_name << "\n";
964     code << tab << "outputs[" << i << "] = r_s_out_" << i << ";\n";
965   }
966 
967   // Apply QFunction
968   code << "\n";
969   code << tab << "// -- Apply QFunction\n";
970   code << tab << "" << qfunction_name << "(ctx, ";
971   if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
972     code << "1";
973   } else {
974     code << Q_name;
975   }
976   code << ", inputs, outputs);\n";
977 
978   if (is_at_points) {
979     // Map back to coefficients
980     code << "\n";
981     code << tab << "// -- Output fields\n";
982     for (CeedInt i = 0; i < num_output_fields; i++) {
983       const char *field_name;
984       std::string var_suffix = "_out_" + std::to_string(i);
985       std::string P_name     = "P_1d" + var_suffix;
986 
987       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
988       code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
989       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
990       // Basis action
991       code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
992       switch (eval_mode) {
993         case CEED_EVAL_NONE: {
994           CeedInt             comp_stride;
995           CeedElemRestriction elem_rstr;
996 
997           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
998           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
999           CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1000           code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
1001           code << tab << "WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
1002                << ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]"
1003                << ", r_s" << var_suffix << ", d" << var_suffix << ");\n";
1004           break;
1005         }
1006         case CEED_EVAL_INTERP:
1007           code << tab << "if (i >= points.num_per_elem[elem]) {\n";
1008           tab.push();
1009           code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
1010           tab.pop();
1011           code << tab << "}\n";
1012           code << tab << "InterpTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
1013                << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
1014           break;
1015         case CEED_EVAL_GRAD:
1016           code << tab << "if (i >= points.num_per_elem[elem]) {\n";
1017           tab.push();
1018           code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
1019           tab.pop();
1020           code << tab << "}\n";
1021           code << tab << "GradTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
1022                << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
1023           break;
1024           // LCOV_EXCL_START
1025         case CEED_EVAL_WEIGHT:
1026           break;  // Should not occur
1027         case CEED_EVAL_DIV:
1028         case CEED_EVAL_CURL:
1029           break;  // TODO: Not implemented
1030                   // LCOV_EXCL_STOP
1031       }
1032     }
1033   } else if (use_3d_slices) {
1034     // Copy or apply transpose grad, if needed
1035     code << "\n";
1036     code << tab << "// -- Output fields\n";
1037     for (CeedInt i = 0; i < num_output_fields; i++) {
1038       const char *field_name;
1039       std::string var_suffix = "_out_" + std::to_string(i);
1040       std::string P_name     = "P_1d" + var_suffix;
1041 
1042       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
1043       code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
1044       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
1045       // Basis action
1046       code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
1047       switch (eval_mode) {
1048         case CEED_EVAL_NONE:
1049           code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
1050           tab.push();
1051           code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
1052           tab.pop();
1053           code << tab << "}\n";
1054           break;
1055         case CEED_EVAL_INTERP:
1056           code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
1057           tab.push();
1058           code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
1059           tab.pop();
1060           code << tab << "}\n";
1061           break;
1062         case CEED_EVAL_GRAD:
1063           code << tab << "GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_s" << var_suffix << ", s_G"
1064                << var_suffix << ", r_q" << var_suffix << ");\n";
1065           break;
1066           // LCOV_EXCL_START
1067         case CEED_EVAL_WEIGHT:
1068           break;  // Should not occur
1069         case CEED_EVAL_DIV:
1070         case CEED_EVAL_CURL:
1071           break;  // TODO: Not implemented
1072                   // LCOV_EXCL_STOP
1073       }
1074     }
1075   }
1076   tab.pop();
1077   code << tab << "}\n";
1078   return CEED_ERROR_SUCCESS;
1079 }
1080 
1081 //------------------------------------------------------------------------------
1082 // Build single operator kernel
1083 //------------------------------------------------------------------------------
1084 extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_build) {
1085   bool                    is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
1086   Ceed                    ceed;
1087   CeedInt                 Q = 0, Q_1d = 0, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
1088   CeedQFunctionField     *qf_input_fields, *qf_output_fields;
1089   CeedQFunction_Cuda_gen *qf_data;
1090   CeedQFunction           qf;
1091   CeedOperatorField      *op_input_fields, *op_output_fields;
1092   CeedOperator_Cuda_gen  *data;
1093   std::ostringstream      code;
1094   Tab                     tab;
1095 
1096   CeedCallBackend(CeedOperatorGetData(op, &data));
1097   {
1098     bool is_setup_done;
1099 
1100     CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
1101     if (is_setup_done) {
1102       *is_good_build = !data->use_fallback;
1103       return CEED_ERROR_SUCCESS;
1104     }
1105   }
1106 
1107   // Check field compatibility
1108   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
1109   {
1110     bool has_shared_bases = true;
1111 
1112     for (CeedInt i = 0; i < num_input_fields; i++) {
1113       CeedBasis basis;
1114 
1115       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
1116       if (basis != CEED_BASIS_NONE) {
1117         bool        is_tensor = true;
1118         const char *resource;
1119         char       *resource_root;
1120         Ceed        basis_ceed;
1121 
1122         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
1123         is_all_tensor    = is_all_tensor && is_tensor;
1124         is_all_nontensor = is_all_nontensor && !is_tensor;
1125         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
1126         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
1127         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
1128         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared");
1129         CeedCallBackend(CeedFree(&resource_root));
1130         CeedCallBackend(CeedDestroy(&basis_ceed));
1131       }
1132       CeedCallBackend(CeedBasisDestroy(&basis));
1133     }
1134 
1135     for (CeedInt i = 0; i < num_output_fields; i++) {
1136       CeedBasis basis;
1137 
1138       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
1139       if (basis != CEED_BASIS_NONE) {
1140         bool        is_tensor = true;
1141         const char *resource;
1142         char       *resource_root;
1143         Ceed        basis_ceed;
1144 
1145         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
1146         is_all_tensor    = is_all_tensor && is_tensor;
1147         is_all_nontensor = is_all_nontensor && !is_tensor;
1148 
1149         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
1150         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
1151         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
1152         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared");
1153         CeedCallBackend(CeedFree(&resource_root));
1154         CeedCallBackend(CeedDestroy(&basis_ceed));
1155       }
1156       CeedCallBackend(CeedBasisDestroy(&basis));
1157     }
1158     // -- Fallback to ref if not all bases are shared
1159     if (!has_shared_bases) {
1160       *is_good_build = false;
1161       return CEED_ERROR_SUCCESS;
1162     }
1163   }
1164   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1165   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
1166   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
1167   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
1168 
1169   // Get operator data
1170   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
1171   {
1172     CeedInt max_P = 0, max_P_1d = 0;
1173 
1174     CeedCallBackend(CeedOperatorBuildKernelData_Cuda_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields,
1175                                                          op_output_fields, qf_output_fields, &max_P, &max_P_1d, &Q, &Q_1d, &max_dim, &is_all_tensor,
1176                                                          &use_3d_slices));
1177     data->max_P_1d = is_all_tensor ? max_P_1d : max_P;
1178   }
1179   if (max_dim == 0) max_dim = 1;
1180   data->dim = max_dim;
1181   if (is_at_points) {
1182     CeedElemRestriction_Cuda *rstr_data;
1183     CeedElemRestriction       rstr_points = NULL;
1184 
1185     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
1186     CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
1187     CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
1188     CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data));
1189     data->points.indices = (CeedInt *)rstr_data->d_offsets;
1190     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
1191   }
1192   if (is_at_points) use_3d_slices = false;
1193   if (Q_1d == 0) {
1194     if (is_at_points) Q_1d = max_num_points;
1195     else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d));
1196   }
1197   if (Q == 0) Q = Q_1d;
1198   data->Q    = Q;
1199   data->Q_1d = Q_1d;
1200 
1201   // Check for restriction only identity operator
1202   {
1203     bool is_identity_qf;
1204 
1205     CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf));
1206     if (is_identity_qf) {
1207       CeedEvalMode eval_mode_in, eval_mode_out;
1208 
1209       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in));
1210       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out));
1211       CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND,
1212                 "Backend does not implement restriction only identity operators");
1213     }
1214   }
1215 
1216   // Add atomicAdd function for old NVidia architectures
1217   {
1218     Ceed_Cuda            *ceed_data;
1219     struct cudaDeviceProp prop;
1220 
1221     CeedCallBackend(CeedGetData(ceed, &ceed_data));
1222     CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id));
1223     if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) {
1224       code << tab << "// AtomicAdd fallback source\n";
1225       code << tab << "#include <ceed/jit-source/cuda/cuda-atomic-add-fallback.h>\n\n";
1226     }
1227   }
1228 
1229   // Load basis source files
1230   if (!is_all_nontensor) {
1231     code << tab << "// Tensor basis source\n";
1232     code << tab << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n";
1233   }
1234   if (!is_all_tensor) {
1235     code << tab << "// Non-tensor basis source\n";
1236     code << tab << "#include <ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h>\n\n";
1237   }
1238   if (!is_all_tensor && !is_all_nontensor) {
1239     code << "// Tensor basis source\n";
1240     code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h>\n\n";
1241   }
1242   if (is_at_points) {
1243     code << "// AtPoints basis source\n";
1244     code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h>\n\n";
1245   }
1246   code << "// CodeGen operator source\n";
1247   code << "#include <ceed/jit-source/cuda/cuda-gen-templates.h>\n\n";
1248 
1249   // Get QFunction name
1250   std::string qfunction_name(qf_data->qfunction_name);
1251   std::string operator_name;
1252 
1253   operator_name = "CeedKernelCudaGenOperator_" + qfunction_name;
1254 
1255   // Define CEED_Q_VLA
1256   code << "\n" << tab << "#undef CEED_Q_VLA\n";
1257   if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
1258     code << tab << "#define CEED_Q_VLA 1\n\n";
1259   } else {
1260     code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n";
1261   }
1262 
1263   // Add user QFunction source
1264   {
1265     const char *source_path;
1266 
1267     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
1268     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/cuda/gen backend requires QFunction source code file");
1269 
1270     code << tab << "// User QFunction source\n";
1271     code << tab << "#include \"" << source_path << "\"\n\n";
1272   }
1273 
1274   // Setup
1275   code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
1276   code << tab << "// Operator Kernel\n";
1277   code << tab << "// \n";
1278   code << tab << "// d_[in,out]_i:   CeedVector device array\n";
1279   code << tab << "// r_[in,out]_e_i: Element vector register\n";
1280   code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
1281   code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
1282   code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
1283   code << tab << "// \n";
1284   code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
1285   code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
1286   code << tab << "// -----------------------------------------------------------------------------\n";
1287   code << tab << "extern \"C\" __global__ void " << operator_name
1288        << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda "
1289           "points) {\n";
1290   tab.push();
1291 
1292   // Scratch buffers
1293   for (CeedInt i = 0; i < num_input_fields; i++) {
1294     CeedEvalMode eval_mode;
1295 
1296     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1297     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
1298       code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
1299     }
1300   }
1301   for (CeedInt i = 0; i < num_output_fields; i++) {
1302     code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
1303   }
1304 
1305   code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
1306   if (!is_all_tensor) {
1307     code << tab << "const CeedInt Q = " << Q << ";\n";
1308   }
1309   if (!is_all_nontensor) {
1310     code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
1311   }
1312   if (is_at_points) {
1313     code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n";
1314     code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
1315   }
1316 
1317   // Shared data
1318   code << tab << "extern __shared__ CeedScalar slice[];\n";
1319   code << tab << "SharedData_Cuda data;\n";
1320   code << tab << "data.t_id_x = threadIdx.x;\n";
1321   code << tab << "data.t_id_y = threadIdx.y;\n";
1322   code << tab << "data.t_id_z = threadIdx.z;\n";
1323   code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
1324   code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
1325 
1326   // -- Determine input mat reuse
1327   FieldReuse_Cuda input_matrix_reuse[CEED_FIELD_MAX];
1328 
1329   for (CeedInt i = 0; i < num_input_fields; i++) {
1330     input_matrix_reuse[i].index = -1;
1331   }
1332   for (CeedInt i = 0; i < num_input_fields; i++) {
1333     bool         is_tensor = true;
1334     CeedEvalMode eval_mode_i;
1335     CeedBasis    basis_i;
1336 
1337     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
1338     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
1339     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
1340     CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
1341     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
1342       CeedEvalMode eval_mode_j;
1343       CeedBasis    basis_j;
1344 
1345       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1346       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1347       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1348       if (basis_i == basis_j) {
1349         if (is_tensor) {
1350           input_matrix_reuse[i].index     = j;
1351           input_matrix_reuse[i].is_input  = true;
1352           input_matrix_reuse[i].eval_mode = eval_mode_j;
1353         } else {
1354           // For non-tensor can only re-use with the same eval mode
1355           if (eval_mode_i == eval_mode_j) {
1356             input_matrix_reuse[i].index     = j;
1357             input_matrix_reuse[i].is_input  = true;
1358             input_matrix_reuse[i].eval_mode = eval_mode_j;
1359           }
1360         }
1361       }
1362       CeedCallBackend(CeedBasisDestroy(&basis_j));
1363     }
1364     CeedCallBackend(CeedBasisDestroy(&basis_i));
1365   }
1366 
1367   // -- Determine output mat reuse
1368   FieldReuse_Cuda output_matrix_reuse[CEED_FIELD_MAX];
1369 
1370   for (CeedInt i = 0; i < num_output_fields; i++) {
1371     output_matrix_reuse[i].index = -1;
1372   }
1373   for (CeedInt i = 0; i < num_output_fields; i++) {
1374     bool         is_tensor = true;
1375     CeedEvalMode eval_mode_i;
1376     CeedBasis    basis_i;
1377 
1378     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
1379     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
1380     CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
1381     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
1382       CeedEvalMode eval_mode_j;
1383       CeedBasis    basis_j;
1384 
1385       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1386       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1387       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1388       if (basis_i == basis_j) {
1389         if (is_tensor) {
1390           output_matrix_reuse[i].index     = j;
1391           output_matrix_reuse[i].is_input  = true;
1392           output_matrix_reuse[i].eval_mode = eval_mode_j;
1393         } else {
1394           // For non-tensor can only re-use with the same eval mode
1395           if (eval_mode_i == eval_mode_j) {
1396             output_matrix_reuse[i].index     = j;
1397             output_matrix_reuse[i].is_input  = true;
1398             output_matrix_reuse[i].eval_mode = eval_mode_j;
1399           }
1400         }
1401       }
1402       CeedCallBackend(CeedBasisDestroy(&basis_j));
1403     }
1404     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
1405       CeedEvalMode eval_mode_j;
1406       CeedBasis    basis_j;
1407 
1408       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
1409       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1410       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
1411       if (basis_i == basis_j) {
1412         if (is_tensor) {
1413           output_matrix_reuse[i].index     = j;
1414           output_matrix_reuse[i].is_input  = false;
1415           output_matrix_reuse[i].eval_mode = eval_mode_j;
1416         } else {
1417           // For non-tensor can only re-use with the same eval mode
1418           if (eval_mode_i == eval_mode_j) {
1419             output_matrix_reuse[i].index     = j;
1420             output_matrix_reuse[i].is_input  = false;
1421             output_matrix_reuse[i].eval_mode = eval_mode_j;
1422           }
1423         }
1424       }
1425       CeedCallBackend(CeedBasisDestroy(&basis_j));
1426     }
1427     CeedCallBackend(CeedBasisDestroy(&basis_i));
1428   }
1429 
1430   // Initialize constants, and matrices B and G
1431   code << "\n" << tab << "// Input field constants and basis data\n";
1432   for (CeedInt i = 0; i < num_input_fields; i++) {
1433     CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
1434                                                               max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1435   }
1436   code << "\n" << tab << "// Output field constants and basis data\n";
1437   for (CeedInt i = 0; i < num_output_fields; i++) {
1438     CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
1439                                                               max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
1440   }
1441 
1442   // Loop over all elements
1443   code << "\n" << tab << "// Element loop\n";
1444   code << tab << "__syncthreads();\n";
1445   code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
1446   tab.push();
1447 
1448   // -- Compute minimum buffer space needed
1449   CeedInt max_rstr_buffer_size = 1;
1450 
1451   for (CeedInt i = 0; i < num_input_fields; i++) {
1452     CeedEvalMode eval_mode;
1453 
1454     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1455     if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
1456       CeedInt             num_comp;
1457       CeedElemRestriction elem_rstr;
1458 
1459       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
1460       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1461       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1462       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1463     }
1464   }
1465   for (CeedInt i = 0; i < num_output_fields; i++) {
1466     CeedEvalMode eval_mode;
1467 
1468     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
1469     if (eval_mode != CEED_EVAL_NONE) {
1470       CeedInt             num_comp;
1471       CeedElemRestriction elem_rstr;
1472 
1473       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
1474       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1475       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1476       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1477     }
1478   }
1479   code << tab << "// Scratch restriction buffer space\n";
1480   code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
1481 
1482   // -- Determine best input field processing order
1483   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
1484 
1485   for (CeedInt i = 0; i < num_input_fields; i++) {
1486     field_rstr_in_buffer[i] = -1;
1487     input_field_order[i]    = -1;
1488   }
1489   {
1490     bool    is_ordered[CEED_FIELD_MAX];
1491     CeedInt curr_index = 0;
1492 
1493     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
1494     for (CeedInt i = 0; i < num_input_fields; i++) {
1495       CeedVector          vec_i;
1496       CeedElemRestriction rstr_i;
1497 
1498       if (is_ordered[i]) continue;
1499       field_rstr_in_buffer[i]       = i;
1500       is_ordered[i]                 = true;
1501       input_field_order[curr_index] = i;
1502       curr_index++;
1503       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
1504       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
1505       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
1506       for (CeedInt j = i + 1; j < num_input_fields; j++) {
1507         CeedVector          vec_j;
1508         CeedElemRestriction rstr_j;
1509 
1510         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
1511         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
1512         if (rstr_i == rstr_j && vec_i == vec_j) {
1513           field_rstr_in_buffer[j]       = i;
1514           is_ordered[j]                 = true;
1515           input_field_order[curr_index] = j;
1516           curr_index++;
1517         }
1518         CeedCallBackend(CeedVectorDestroy(&vec_j));
1519         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
1520       }
1521       CeedCallBackend(CeedVectorDestroy(&vec_i));
1522       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
1523     }
1524   }
1525 
1526   // -- Input restriction and basis
1527   code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
1528   for (CeedInt i = 0; i < num_input_fields; i++) {
1529     const char   *field_name;
1530     const CeedInt f = input_field_order[i];
1531 
1532     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
1533     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
1534 
1535     // ---- Restriction
1536     CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
1537                                                                 max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1538 
1539     // ---- Basis action
1540     CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
1541                                                           is_all_tensor, is_at_points, use_3d_slices));
1542   }
1543 
1544   // -- Q function
1545   CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
1546                                                             qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
1547                                                             Q_1d, is_all_tensor, is_at_points, use_3d_slices));
1548 
1549   // -- Output basis and restriction
1550   code << "\n" << tab << "// -- Output field basis action and restrictions\n";
1551   for (CeedInt i = 0; i < num_output_fields; i++) {
1552     const char *field_name;
1553 
1554     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
1555     code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
1556 
1557     // ---- Basis action
1558     CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
1559                                                           is_all_tensor, is_at_points, use_3d_slices));
1560 
1561     // ---- Restriction
1562     CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, tab, i, NULL, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d,
1563                                                                 false, is_all_tensor, is_at_points, use_3d_slices));
1564   }
1565 
1566   // Close loop and function
1567   tab.pop();
1568   code << tab << "}\n";
1569   tab.pop();
1570   code << tab << "}\n";
1571   code << tab << "// -----------------------------------------------------------------------------\n\n";
1572 
1573   // Compile
1574   {
1575     bool          is_compile_good = false;
1576     const CeedInt T_1d            = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d);
1577 
1578     data->thread_1d = T_1d;
1579     CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "OP_T_1D", T_1d));
1580     if (is_compile_good) {
1581       *is_good_build = true;
1582       CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op));
1583     } else {
1584       *is_good_build     = false;
1585       data->use_fallback = true;
1586     }
1587   }
1588   CeedCallBackend(CeedOperatorSetSetupDone(op));
1589   CeedCallBackend(CeedDestroy(&ceed));
1590   CeedCallBackend(CeedQFunctionDestroy(&qf));
1591   return CEED_ERROR_SUCCESS;
1592 }
1593 
1594 //------------------------------------------------------------------------------
1595 // Build AtPoints assembly operator kernel
1596 //------------------------------------------------------------------------------
1597 static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, bool is_full, bool *is_good_build) {
1598   bool                    is_all_tensor = true, is_at_points = false, use_3d_slices = false;
1599   Ceed                    ceed;
1600   CeedInt                 Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
1601   CeedQFunctionField     *qf_input_fields, *qf_output_fields;
1602   CeedQFunction_Cuda_gen *qf_data;
1603   CeedQFunction           qf;
1604   CeedOperatorField      *op_input_fields, *op_output_fields;
1605   CeedOperator_Cuda_gen  *data;
1606   std::ostringstream      code;
1607   Tab                     tab;
1608 
1609   // Check compatibility
1610   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1611   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
1612   CeedCheck(is_at_points, ceed, CEED_ERROR_BACKEND, "Only AtPoints operator assembly supported");
1613 
1614   // Retrieve operator data
1615   CeedCallBackend(CeedOperatorGetData(op, &data));
1616   Q       = data->Q;
1617   Q_1d    = data->Q_1d;
1618   max_dim = data->dim;
1619   {
1620     CeedElemRestriction rstr_points = NULL;
1621 
1622     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
1623     CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
1624     CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
1625     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
1626   }
1627   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
1628   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
1629   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
1630   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
1631 
1632   // Add atomicAdd function for old NVidia architectures
1633   {
1634     Ceed_Cuda            *ceed_data;
1635     struct cudaDeviceProp prop;
1636 
1637     CeedCallBackend(CeedGetData(ceed, &ceed_data));
1638     CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id));
1639     if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) {
1640       code << tab << "// AtomicAdd fallback source\n";
1641       code << tab << "#include <ceed/jit-source/cuda/cuda-atomic-add-fallback.h>\n\n";
1642     }
1643   }
1644 
1645   // Load basis source files
1646   code << tab << "// Tensor basis source\n";
1647   code << tab << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n";
1648   code << tab << "// AtPoints basis source\n";
1649   code << tab << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h>\n\n";
1650   code << tab << "// CodeGen operator source\n";
1651   code << tab << "#include <ceed/jit-source/cuda/cuda-gen-templates.h>\n\n";
1652 
1653   // Get QFunction name
1654   std::string qfunction_name(qf_data->qfunction_name);
1655   std::string operator_name;
1656 
1657   if (is_full) {
1658     operator_name = "CeedKernelCudaGenOperatorFullAssembly_" + qfunction_name;
1659   } else {
1660     operator_name = "CeedKernelCudaGenOperatorDiagonalAssembly_" + qfunction_name;
1661   }
1662 
1663   // Define CEED_Q_VLA
1664   code << "\n" << tab << "#undef CEED_Q_VLA\n";
1665   code << tab << "#define CEED_Q_VLA 1\n\n";
1666 
1667   // Add user QFunction source
1668   {
1669     const char *source_path;
1670 
1671     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
1672     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/cuda/gen backend requires QFunction source code file");
1673 
1674     code << tab << "// User QFunction source\n";
1675     code << tab << "#include \"" << source_path << "\"\n\n";
1676   }
1677 
1678   // Setup
1679   code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
1680   code << tab << "// Operator Assembly Kernel\n";
1681   code << tab << "// \n";
1682   code << tab << "// d_[in,out]_i:   CeedVector device array\n";
1683   code << tab << "// r_[in,out]_e_i: Element vector register\n";
1684   code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
1685   code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
1686   code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
1687   code << tab << "// \n";
1688   code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
1689   code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
1690   code << tab << "// -----------------------------------------------------------------------------\n";
1691   code << tab << "extern \"C\" __global__ void " << operator_name
1692        << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda "
1693           "points, CeedScalar *__restrict__ values_array) {\n";
1694   tab.push();
1695 
1696   // Scratch buffers
1697   for (CeedInt i = 0; i < num_input_fields; i++) {
1698     CeedEvalMode eval_mode;
1699 
1700     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1701     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
1702       code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
1703     }
1704   }
1705   for (CeedInt i = 0; i < num_output_fields; i++) {
1706     code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
1707   }
1708 
1709   code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
1710   code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
1711   code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n";
1712   code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
1713 
1714   // Shared data
1715   code << tab << "extern __shared__ CeedScalar slice[];\n";
1716   code << tab << "SharedData_Cuda data;\n";
1717   code << tab << "data.t_id_x = threadIdx.x;\n";
1718   code << tab << "data.t_id_y = threadIdx.y;\n";
1719   code << tab << "data.t_id_z = threadIdx.z;\n";
1720   code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
1721   code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
1722 
1723   // -- Determine input mat reuse
1724   FieldReuse_Cuda input_matrix_reuse[CEED_FIELD_MAX];
1725 
1726   for (CeedInt i = 0; i < num_input_fields; i++) {
1727     input_matrix_reuse[i].index = -1;
1728   }
1729   for (CeedInt i = 0; i < num_input_fields; i++) {
1730     CeedEvalMode eval_mode_i;
1731     CeedBasis    basis_i;
1732 
1733     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
1734     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
1735     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
1736     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
1737       CeedEvalMode eval_mode_j;
1738       CeedBasis    basis_j;
1739 
1740       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1741       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1742       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1743       if (basis_i == basis_j) {
1744         input_matrix_reuse[i].index     = j;
1745         input_matrix_reuse[i].is_input  = true;
1746         input_matrix_reuse[i].eval_mode = eval_mode_j;
1747       }
1748       CeedCallBackend(CeedBasisDestroy(&basis_j));
1749     }
1750     CeedCallBackend(CeedBasisDestroy(&basis_i));
1751   }
1752 
1753   // -- Determine output mat reuse
1754   FieldReuse_Cuda output_matrix_reuse[CEED_FIELD_MAX];
1755 
1756   for (CeedInt i = 0; i < num_output_fields; i++) {
1757     output_matrix_reuse[i].index = -1;
1758   }
1759   for (CeedInt i = 0; i < num_output_fields; i++) {
1760     CeedEvalMode eval_mode_i;
1761     CeedBasis    basis_i;
1762 
1763     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
1764     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
1765     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
1766       CeedEvalMode eval_mode_j;
1767       CeedBasis    basis_j;
1768 
1769       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1770       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1771       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1772       if (basis_i == basis_j) {
1773         output_matrix_reuse[i].index     = j;
1774         output_matrix_reuse[i].is_input  = true;
1775         output_matrix_reuse[i].eval_mode = eval_mode_j;
1776       }
1777       CeedCallBackend(CeedBasisDestroy(&basis_j));
1778     }
1779     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
1780       CeedEvalMode eval_mode_j;
1781       CeedBasis    basis_j;
1782 
1783       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
1784       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1785       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
1786       if (basis_i == basis_j) {
1787         output_matrix_reuse[i].index     = j;
1788         output_matrix_reuse[i].is_input  = false;
1789         output_matrix_reuse[i].eval_mode = eval_mode_j;
1790       }
1791       CeedCallBackend(CeedBasisDestroy(&basis_j));
1792     }
1793     CeedCallBackend(CeedBasisDestroy(&basis_i));
1794   }
1795 
1796   // Initialize constants, and matrices B and G
1797   code << "\n" << tab << "// Input field constants and basis data\n";
1798   for (CeedInt i = 0; i < num_input_fields; i++) {
1799     CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
1800                                                               max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1801   }
1802   code << "\n" << tab << "// Output field constants and basis data\n";
1803   for (CeedInt i = 0; i < num_output_fields; i++) {
1804     CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
1805                                                               max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
1806   }
1807 
1808   // Loop over all elements
1809   code << "\n" << tab << "// Element loop\n";
1810   code << tab << "__syncthreads();\n";
1811   code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
1812   tab.push();
1813 
1814   // -- Compute minimum buffer space needed
1815   CeedInt max_rstr_buffer_size = 1;
1816 
1817   for (CeedInt i = 0; i < num_input_fields; i++) {
1818     CeedEvalMode eval_mode;
1819 
1820     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1821     if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
1822       CeedInt             num_comp;
1823       CeedElemRestriction elem_rstr;
1824 
1825       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
1826       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1827       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1828       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1829     }
1830   }
1831   for (CeedInt i = 0; i < num_output_fields; i++) {
1832     CeedEvalMode eval_mode;
1833 
1834     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
1835     if (eval_mode != CEED_EVAL_NONE) {
1836       CeedInt             num_comp;
1837       CeedElemRestriction elem_rstr;
1838 
1839       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
1840       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1841       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1842       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1843     }
1844   }
1845   code << tab << "// Scratch restriction buffer space\n";
1846   code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
1847 
1848   // -- Determine best input field processing order
1849   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
1850 
1851   for (CeedInt i = 0; i < num_input_fields; i++) {
1852     field_rstr_in_buffer[i] = -1;
1853     input_field_order[i]    = -1;
1854   }
1855   {
1856     bool    is_ordered[CEED_FIELD_MAX];
1857     CeedInt curr_index = 0;
1858 
1859     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
1860     for (CeedInt i = 0; i < num_input_fields; i++) {
1861       CeedVector          vec_i;
1862       CeedElemRestriction rstr_i;
1863 
1864       if (is_ordered[i]) continue;
1865       field_rstr_in_buffer[i]       = i;
1866       is_ordered[i]                 = true;
1867       input_field_order[curr_index] = i;
1868       curr_index++;
1869       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
1870       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
1871       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
1872       for (CeedInt j = i + 1; j < num_input_fields; j++) {
1873         CeedVector          vec_j;
1874         CeedElemRestriction rstr_j;
1875 
1876         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
1877         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
1878         if (rstr_i == rstr_j && vec_i == vec_j) {
1879           field_rstr_in_buffer[j]       = i;
1880           is_ordered[j]                 = true;
1881           input_field_order[curr_index] = j;
1882           curr_index++;
1883         }
1884         CeedCallBackend(CeedVectorDestroy(&vec_j));
1885         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
1886       }
1887       CeedCallBackend(CeedVectorDestroy(&vec_i));
1888       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
1889     }
1890   }
1891 
1892   // -- Input restriction and basis
1893   code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
1894   CeedInt active_field_index = -1;
1895 
1896   for (CeedInt i = 0; i < num_input_fields; i++) {
1897     bool          is_active = false;
1898     const char   *field_name;
1899     const CeedInt f = input_field_order[i];
1900 
1901     {
1902       CeedVector vec;
1903 
1904       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
1905       is_active = vec == CEED_VECTOR_ACTIVE;
1906       CeedCallBackend(CeedVectorDestroy(&vec));
1907     }
1908 
1909     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
1910     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
1911 
1912     if (is_active) {
1913       std::string var_suffix = "_in_" + std::to_string(f);
1914 
1915       code << tab << "// Active field - no restriction or basis action here\n";
1916       if (active_field_index == -1) {
1917         active_field_index = f;
1918         code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? "P_1d" + var_suffix : "1")
1919              << "] = {0.0};\n";
1920       } else {
1921         code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_in_" << active_field_index << ";\n";
1922       }
1923     } else {
1924       // ---- Restriction
1925       CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
1926                                                                   max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1927 
1928       // ---- Basis action
1929       CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
1930                                                             is_all_tensor, is_at_points, use_3d_slices));
1931     }
1932   }
1933 
1934   // -- Loop over active field
1935   std::string active_var_suffix = "_in_" + std::to_string(active_field_index);
1936 
1937   code << "\n" << tab << "// Loop over nodes in active field\n";
1938   code << tab << "for (CeedInt n = 0; n < num_comp" << active_var_suffix << "*P_1d" << active_var_suffix
1939        << (max_dim > 1 ? "*P_1d" + active_var_suffix : "") << (max_dim > 2 ? "*P_1d" + active_var_suffix : "") << "; n++) {\n";
1940   tab.push();
1941 
1942   // -- Set current active node and component to 1
1943   code << tab << "// Set current active node and component to 1.0\n";
1944   code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 1.0, r_e"
1945        << active_var_suffix << ");\n\n";
1946 
1947   for (CeedInt i = 0; i < num_input_fields; i++) {
1948     bool          is_active = false;
1949     const char   *field_name;
1950     const CeedInt f = input_field_order[i];
1951 
1952     {
1953       CeedVector vec;
1954 
1955       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
1956       is_active = vec == CEED_VECTOR_ACTIVE;
1957       CeedCallBackend(CeedVectorDestroy(&vec));
1958     }
1959     if (!is_active) continue;
1960 
1961     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
1962     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
1963 
1964     // ---- Basis action
1965     CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
1966                                                           is_all_tensor, is_at_points, use_3d_slices));
1967   }
1968 
1969   // -- Q function
1970   CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
1971                                                             qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
1972                                                             Q_1d, is_all_tensor, is_at_points, use_3d_slices));
1973 
1974   // -- Output basis and restriction
1975   code << "\n" << tab << "// -- Output field basis action and restrictions\n";
1976   for (CeedInt i = 0; i < num_output_fields; i++) {
1977     bool        is_active = false;
1978     const char *field_name;
1979 
1980     {
1981       CeedVector vec;
1982 
1983       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
1984       is_active = vec == CEED_VECTOR_ACTIVE;
1985       CeedCallBackend(CeedVectorDestroy(&vec));
1986     }
1987     if (!is_active) continue;
1988 
1989     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
1990     code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
1991 
1992     // ---- Basis action
1993     CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
1994                                                           is_all_tensor, is_at_points, use_3d_slices));
1995 
1996     // ---- Restriction
1997     if (is_full) {
1998       // TODO: UPDATE OUTPUTS FOR FULL ASSEMBLY
1999     } else {
2000       std::string         var_suffix = "_out_" + std::to_string(i);
2001       CeedInt             comp_stride;
2002       CeedSize            l_size;
2003       CeedElemRestriction elem_rstr;
2004 
2005       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
2006       CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
2007       code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
2008       CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
2009       code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
2010       code << tab << "WriteLVecStandard" << max_dim << "d_Single<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
2011            << ">(data, l_size" << var_suffix << ", elem, n, indices.outputs[" << i << "], r_e" << var_suffix << ", values_array);\n";
2012       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
2013     }
2014   }
2015 
2016   // -- Reset current active node and component
2017   code << "\n" << tab << "// Reset current active node and component to 0.0\n";
2018   code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 0.0, r_e"
2019        << active_var_suffix << ");\n";
2020 
2021   // -- End of loop over active field
2022   tab.pop();
2023   code << tab << "}\n";
2024 
2025   // Close loop and function
2026   tab.pop();
2027   code << tab << "}\n";
2028   tab.pop();
2029   code << tab << "}\n";
2030   code << tab << "// -----------------------------------------------------------------------------\n\n";
2031 
2032   // Compile
2033   {
2034     bool          is_compile_good = false;
2035     const CeedInt T_1d            = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d);
2036 
2037     data->thread_1d = T_1d;
2038     CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good,
2039                                         is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 1, "OP_T_1D", T_1d));
2040     if (is_compile_good) {
2041       *is_good_build = true;
2042       CeedCallBackend(CeedGetKernel_Cuda(ceed, is_full ? data->module_assemble_full : data->module_assemble_diagonal, operator_name.c_str(),
2043                                          is_full ? &data->assemble_full : &data->assemble_diagonal));
2044     } else {
2045       *is_good_build              = false;
2046       data->use_assembly_fallback = true;
2047     }
2048   }
2049   CeedCallBackend(CeedDestroy(&ceed));
2050   CeedCallBackend(CeedQFunctionDestroy(&qf));
2051   return CEED_ERROR_SUCCESS;
2052 }
2053 
2054 extern "C" int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Cuda_gen(CeedOperator op, bool *is_good_build) {
2055   return CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(op, false, is_good_build);
2056 }
2057 
2058 //------------------------------------------------------------------------------
2059