xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision 0a2a64927a7a47782a31ad89eee1373d25546e0c)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #define CEED_DEBUG_COLOR 12
9 
10 #include <ceed.h>
11 #include <ceed/backend.h>
12 #include <ceed/jit-tools.h>
13 #include <cuda_runtime.h>
14 
15 #include <iostream>
16 #include <sstream>
17 #include <string>
18 
19 #include "../cuda-ref/ceed-cuda-ref.h"
20 #include "../cuda-shared/ceed-cuda-shared.h"
21 #include "../cuda/ceed-cuda-common.h"
22 #include "../cuda/ceed-cuda-compile.h"
23 #include "ceed-cuda-gen.h"
24 
25 //------------------------------------------------------------------------------
26 // Determine type of operator
27 //------------------------------------------------------------------------------
28 static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
29                                                 CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields,
30                                                 CeedQFunctionField *qf_output_fields, CeedInt *max_P_1d, CeedInt *Q_1d, CeedInt *dim, bool *is_tensor,
31                                                 bool *use_3d_slices) {
32   // Find dim, P_1d, Q_1d
33   *max_P_1d  = 0;
34   *Q_1d      = 0;
35   *dim       = 0;
36   *is_tensor = true;
37 
38   for (CeedInt i = 0; i < num_input_fields; i++) {
39     CeedBasis basis;
40 
41     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
42     if (basis != CEED_BASIS_NONE) {
43       bool    is_field_tensor;
44       CeedInt field_P_1d = 0, field_Q_1d = 0, field_dim = 0;
45 
46       // Collect dim, P_1d, and Q_1d
47       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
48       *is_tensor = *is_tensor && is_field_tensor;
49       if (is_field_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
50       else CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P_1d));
51       *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
52       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
53       CeedCheck(*dim == 0 || field_dim == *dim, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
54       *dim = field_dim;
55       if (is_field_tensor) CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
56       else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q_1d));
57       CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
58       *Q_1d = field_Q_1d;
59     }
60     CeedCallBackend(CeedBasisDestroy(&basis));
61   }
62   for (CeedInt i = 0; i < num_output_fields; i++) {
63     CeedBasis basis;
64 
65     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
66     if (basis != CEED_BASIS_NONE) {
67       bool    is_field_tensor;
68       CeedInt field_P_1d = 0, field_Q_1d = 0, field_dim = 0;
69 
70       // Collect dim, P_1d, and Q_1d
71       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
72       *is_tensor = *is_tensor && is_field_tensor;
73       if (is_field_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
74       else CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P_1d));
75       *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
76       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
77       CeedCheck(*dim == 0 || field_dim == *dim, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
78       *dim = field_dim;
79       if (is_field_tensor) CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
80       else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q_1d));
81       CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
82       *Q_1d = field_Q_1d;
83     }
84     CeedCallBackend(CeedBasisDestroy(&basis));
85   }
86 
87   // Only use 3D collocated gradient parallelization strategy when gradient is computed
88   *use_3d_slices = false;
89   if (*dim == 3) {
90     bool was_grad_found = false;
91 
92     for (CeedInt i = 0; i < num_input_fields; i++) {
93       CeedEvalMode eval_mode;
94 
95       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
96       if (eval_mode == CEED_EVAL_GRAD) {
97         CeedBasis_Cuda_shared *basis_data;
98         CeedBasis              basis;
99 
100         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
101         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
102         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
103         was_grad_found = true;
104         CeedCallBackend(CeedBasisDestroy(&basis));
105       }
106     }
107     for (CeedInt i = 0; i < num_output_fields; i++) {
108       CeedEvalMode eval_mode;
109 
110       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
111       if (eval_mode == CEED_EVAL_GRAD) {
112         CeedBasis_Cuda_shared *basis_data;
113         CeedBasis              basis;
114 
115         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
116         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
117         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
118         was_grad_found = true;
119         CeedCallBackend(CeedBasisDestroy(&basis));
120       }
121     }
122   }
123   return CEED_ERROR_SUCCESS;
124 }
125 
126 //------------------------------------------------------------------------------
127 // Setup fields
128 //------------------------------------------------------------------------------
129 static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedOperatorField op_field,
130                                                      CeedQFunctionField qf_field, CeedInt field_reuse[3], CeedInt Q_1d, bool is_input, bool is_tensor,
131                                                      bool is_at_points, bool use_3d_slices) {
132   std::string            var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
133   std::string            P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
134   std::string            option_name = (is_input ? "inputs" : "outputs");
135   CeedEvalMode           eval_mode   = CEED_EVAL_NONE;
136   CeedInt                elem_size = 0, num_comp = 0, P_1d = 0;
137   CeedElemRestriction    elem_rstr;
138   CeedBasis_Cuda_shared *basis_data;
139   CeedBasis              basis;
140 
141   // Field reuse info
142   bool         use_previous_field = field_reuse[0] != -1;
143   bool         reuse_input        = field_reuse[1];
144   CeedInt      reuse_field        = field_reuse[0];
145   CeedEvalMode reuse_mode         = (CeedEvalMode)field_reuse[2];
146 
147   code << "  // -- " << (is_input ? "Input" : "Output") << " field " << i << "\n";
148 
149   // Get field data
150   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
151   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
152     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
153     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
154   }
155   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
156   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
157   if (basis != CEED_BASIS_NONE) {
158     CeedCallBackend(CeedBasisGetData(basis, &basis_data));
159     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
160     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
161   }
162   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
163 
164   // Set field constants
165   if (eval_mode != CEED_EVAL_WEIGHT) {
166     code << "  const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
167     code << "  const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
168   }
169 
170   // Load basis data
171   code << "  // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
172   switch (eval_mode) {
173     case CEED_EVAL_NONE:
174       break;
175     case CEED_EVAL_INTERP:
176       if (is_at_points) {
177         // AtPoints
178         if (!basis_data->d_chebyshev_interp_1d) {
179           CeedSize    interp_bytes;
180           CeedScalar *chebyshev_interp_1d;
181 
182           interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
183           CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
184           CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
185           CeedCallCuda(CeedBasisReturnCeed(basis), cudaMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
186           CeedCallCuda(CeedBasisReturnCeed(basis),
187                        cudaMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
188           CeedCallBackend(CeedFree(&chebyshev_interp_1d));
189         }
190         if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
191         else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
192       } else {
193         // Standard quadrature
194         if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
195         else data->B.outputs[i] = basis_data->d_interp_1d;
196       }
197       if (use_previous_field) {
198         std::string reuse_var = "s_B" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
199 
200         code << "  CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
201       } else {
202         code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
203         code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
204       }
205       break;
206     case CEED_EVAL_GRAD:
207       if (is_at_points) {
208         // AtPoints
209         if (!basis_data->d_chebyshev_interp_1d) {
210           CeedSize    interp_bytes;
211           CeedScalar *chebyshev_interp_1d;
212 
213           interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
214           CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
215           CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
216           CeedCallCuda(CeedBasisReturnCeed(basis), cudaMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
217           CeedCallCuda(CeedBasisReturnCeed(basis),
218                        cudaMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
219           CeedCallBackend(CeedFree(&chebyshev_interp_1d));
220         }
221         if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
222         else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
223       } else {
224         // Standard quadrature
225         if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
226         else data->B.outputs[i] = basis_data->d_interp_1d;
227       }
228       if (is_tensor) {
229         if (use_previous_field) {
230           std::string reuse_var = "s_B" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
231 
232           code << "  CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
233         } else {
234           code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
235           code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
236         }
237       }
238       if (is_at_points) break;  // No G mat for AtPoints
239       if (use_3d_slices) {
240         if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
241         else data->G.outputs[i] = basis_data->d_collo_grad_1d;
242         if (use_previous_field && reuse_mode == CEED_EVAL_GRAD) {
243           std::string reuse_var = "s_G" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
244 
245           code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
246         } else {
247           code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
248           code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
249         }
250       } else {
251         bool has_collo_grad = basis_data->d_collo_grad_1d;
252 
253         if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
254         else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
255         if (has_collo_grad) {
256           if (use_previous_field && reuse_mode == CEED_EVAL_GRAD) {
257             std::string reuse_var = "s_G" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
258 
259             code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
260           } else {
261             code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
262             code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
263           }
264         } else {
265           if (use_previous_field && reuse_mode == CEED_EVAL_GRAD) {
266             std::string reuse_var = "s_G" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));
267 
268             code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
269           } else {
270             code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim") << "];\n";
271             code << "  LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << ">(data, G." << option_name << "[" << i << "], s_G"
272                  << var_suffix << ");\n";
273           }
274         }
275       }
276       break;
277     case CEED_EVAL_WEIGHT:
278       break;  // No action
279       // LCOV_EXCL_START
280     case CEED_EVAL_DIV:
281     case CEED_EVAL_CURL:
282       break;  // TODO: Not implemented
283               // LCOV_EXCL_STOP
284   }
285   CeedCallBackend(CeedBasisDestroy(&basis));
286   return CEED_ERROR_SUCCESS;
287 }
288 
289 //------------------------------------------------------------------------------
290 // Restriction
291 //------------------------------------------------------------------------------
292 static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim,
293                                                        CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
294                                                        CeedInt Q_1d, bool is_input, bool is_tensor, bool is_at_points, bool use_3d_slices) {
295   std::string               var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
296   std::string               P_name     = (is_tensor ? "P_1d" : "P") + var_suffix;
297   CeedEvalMode              eval_mode  = CEED_EVAL_NONE;
298   CeedInt                   elem_size = 0, num_comp = 0, P_1d = 0;
299   CeedSize                  l_size;
300   CeedRestrictionType       rstr_type = CEED_RESTRICTION_STANDARD;
301   CeedElemRestriction_Cuda *rstr_data;
302   CeedElemRestriction       elem_rstr;
303   CeedBasis                 basis;
304 
305   // Get field data
306   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
307   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
308     CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
309     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
310     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
311     CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
312   }
313   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
314   if (basis != CEED_BASIS_NONE) {
315     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
316     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
317   }
318   CeedCallBackend(CeedBasisDestroy(&basis));
319   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
320 
321   // Restriction
322   if (is_input) {
323     // Input
324     if (field_input_buffer[i] != i) {
325       std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);
326 
327       // Restriction was already done for previous input
328       code << "    CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
329     } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) {
330       if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) {
331         // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated
332         code << "    CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
333       } else if (rstr_type != CEED_RESTRICTION_POINTS) {
334         // Otherwise we're using the scratch space
335         code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
336       }
337       switch (rstr_type) {
338         case CEED_RESTRICTION_STANDARD: {
339           CeedInt comp_stride;
340 
341           CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
342           code << "    const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
343           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
344           code << "    // CompStride: " << comp_stride << "\n";
345           data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
346           code << "    ReadLVecStandard" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name
347                << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix << ");\n";
348           break;
349         }
350         case CEED_RESTRICTION_STRIDED: {
351           bool    has_backend_strides;
352           CeedInt num_elem;
353 
354           CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
355           CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
356           CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
357 
358           if (!has_backend_strides) {
359             CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
360           }
361           code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
362           code << "    ReadLVecStrided" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", " << strides[0] << ", "
363                << strides[1] << ", " << strides[2] << ">(data, elem, d" << var_suffix << ", r_e" << var_suffix << ");\n";
364           break;
365         }
366         case CEED_RESTRICTION_POINTS: {
367           CeedInt comp_stride;
368 
369           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
370           code << "    const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
371           data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
372           break;
373         }
374         // LCOV_EXCL_START
375         case CEED_RESTRICTION_ORIENTED:
376         case CEED_RESTRICTION_CURL_ORIENTED:
377           break;  // TODO: Not implemented
378                   // LCOV_EXCL_STOP
379       }
380     }
381   } else {
382     // Output
383     switch (rstr_type) {
384       case CEED_RESTRICTION_STANDARD: {
385         CeedInt comp_stride;
386 
387         CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
388         code << "    const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
389         CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
390         code << "    // CompStride: " << comp_stride << "\n";
391         data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
392         code << "    WriteLVecStandard" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name
393              << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix << ");\n";
394         break;
395       }
396       case CEED_RESTRICTION_STRIDED: {
397         bool    has_backend_strides;
398         CeedInt num_elem;
399 
400         CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
401         CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
402         CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
403 
404         if (!has_backend_strides) {
405           CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
406         }
407         code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
408         code << "    WriteLVecStrided" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", " << strides[0] << ", "
409              << strides[1] << ", " << strides[2] << ">(data, elem, r_e" << var_suffix << ", d" << var_suffix << ");\n";
410         break;
411       }
412       case CEED_RESTRICTION_POINTS:
413         data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
414         break;
415       // LCOV_EXCL_START
416       case CEED_RESTRICTION_ORIENTED:
417       case CEED_RESTRICTION_CURL_ORIENTED:
418         break;  // TODO: Not implemented
419                 // LCOV_EXCL_STOP
420     }
421   }
422   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
423   return CEED_ERROR_SUCCESS;
424 }
425 
426 //------------------------------------------------------------------------------
427 // Basis
428 //------------------------------------------------------------------------------
429 static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim,
430                                                  CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool is_tensor,
431                                                  bool is_at_points, bool use_3d_slices) {
432   std::string         var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
433   std::string         P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
434   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
435   CeedInt             elem_size = 0, num_comp = 0, P_1d = 0;
436   CeedElemRestriction elem_rstr;
437   CeedBasis           basis;
438 
439   // Get field data
440   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
441   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
442     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
443     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
444   }
445   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
446   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
447   if (basis != CEED_BASIS_NONE) {
448     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
449     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
450   }
451   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
452 
453   // Basis
454   code << "    // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
455   if (is_input) {
456     switch (eval_mode) {
457       case CEED_EVAL_NONE:
458         if (!use_3d_slices && !is_at_points) {
459           code << "    CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n";
460         }
461         break;
462       case CEED_EVAL_INTERP:
463         if (is_at_points) {
464           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
465 
466           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
467           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
468                << var_suffix << ", r_c" << var_suffix << ");\n";
469         } else {
470           std::string function_name = is_tensor ? ((dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d") : "InterpNonTensor";
471 
472           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
473           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
474                << var_suffix << ", r_q" << var_suffix << ");\n";
475         }
476         break;
477       case CEED_EVAL_GRAD:
478         if (is_at_points) {
479           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
480 
481           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
482           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
483                << var_suffix << ", r_c" << var_suffix << ");\n";
484         } else if (use_3d_slices) {
485           std::string function_name = (dim > 1 ? "InterpTensor" : "Interp") + std::to_string(dim) + "d";
486 
487           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
488           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
489                << var_suffix << ", r_q" << var_suffix << ");\n";
490         } else if (is_tensor) {
491           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
492           std::string function_name = (dim == 1 ? "Grad" : (is_collocated ? "GradTensorCollocated" : "GradTensor")) + std::to_string(dim) + "d";
493 
494           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << (dim >= 3 ? Q_name : "1") << "];\n";
495           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
496                << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
497         } else {
498           std::string function_name = "GradNonTensor";
499 
500           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
501           code << "    " << function_name << "<num_comp" << var_suffix << ", dim, " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix
502                << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
503         }
504         break;
505       case CEED_EVAL_WEIGHT: {
506         if (is_at_points) {
507           code << "    // Nothing to do AtPoints\n";
508         } else {
509           CeedBasis_Cuda_shared *basis_data;
510           std::string            function_name = is_tensor ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d") : "WeightNonTensor";
511 
512           code << "    CeedScalar r_q" << var_suffix << "[" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
513           CeedCallBackend(CeedBasisGetData(basis, &basis_data));
514           data->W = basis_data->d_q_weight_1d;
515           code << "    " << function_name << "<" << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
516         }
517         break;
518       }
519       // LCOV_EXCL_START
520       case CEED_EVAL_DIV:
521       case CEED_EVAL_CURL:
522         break;  // TODO: Not implemented
523                 // LCOV_EXCL_STOP
524     }
525   } else {
526     switch (eval_mode) {
527       case CEED_EVAL_NONE:
528         code << "    CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
529         break;  // No action
530       case CEED_EVAL_INTERP:
531         code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
532         if (is_at_points) {
533           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
534 
535           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_c" << var_suffix << ", s_B"
536                << var_suffix << ", r_e" << var_suffix << ");\n";
537         } else {
538           std::string function_name =
539               is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d") : "InterpTransposeNonTensor";
540 
541           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix << ", s_B"
542                << var_suffix << ", r_e" << var_suffix << ");\n";
543         }
544         break;
545       case CEED_EVAL_GRAD:
546         code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
547         if (is_at_points) {
548           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
549 
550           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_c" << var_suffix << ", s_B"
551                << var_suffix << ", r_e" << var_suffix << ");\n";
552         } else if (use_3d_slices) {
553           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
554 
555           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix << ", s_B"
556                << var_suffix << ", r_e" << var_suffix << ");\n";
557         } else if (is_tensor) {
558           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
559           std::string function_name =
560               (dim == 1 ? "GradTranspose" : (is_collocated ? "GradTransposeTensorCollocated" : "GradTransposeTensor")) + std::to_string(dim) + "d";
561 
562           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix << ", s_B"
563                << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
564         } else {
565           std::string function_name = "GradTransposeNonTensor";
566 
567           code << "    " << function_name << "<num_comp" << var_suffix << ", dim, " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix
568                << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
569         }
570         break;
571       // LCOV_EXCL_START
572       case CEED_EVAL_WEIGHT:
573         break;  // Should not occur
574       case CEED_EVAL_DIV:
575       case CEED_EVAL_CURL:
576         break;  // TODO: Not implemented
577                 // LCOV_EXCL_STOP
578     }
579   }
580   CeedCallBackend(CeedBasisDestroy(&basis));
581   return CEED_ERROR_SUCCESS;
582 }
583 
584 //------------------------------------------------------------------------------
585 // QFunction
586 //------------------------------------------------------------------------------
587 static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt dim, CeedInt max_num_points,
588                                                      CeedInt num_input_fields, CeedOperatorField *op_input_fields,
589                                                      CeedQFunctionField *qf_input_fields, CeedInt num_output_fields,
590                                                      CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields,
591                                                      std::string qfunction_name, CeedInt Q_1d, bool is_tensor, bool is_at_points,
592                                                      bool use_3d_slices) {
593   std::string         Q_name    = is_tensor ? "Q_1d" : "Q";
594   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
595   CeedElemRestriction elem_rstr;
596 
597   // Setup output arrays
598   code << "\n    // -- Output field setup\n";
599   for (CeedInt i = 0; i < num_output_fields; i++) {
600     std::string var_suffix = "_out_" + std::to_string(i);
601 
602     code << "    // ---- Output field " << i << "\n";
603     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
604     switch (eval_mode) {
605       case CEED_EVAL_NONE:
606         if (is_at_points) {
607           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n";
608         } else {
609           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
610         }
611         break;
612       case CEED_EVAL_INTERP:
613         if (is_at_points) {
614           // Accumulator for point data
615           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
616           code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "; i++) {\n";
617           code << "      r_c" << var_suffix << "[i] = 0.0;\n";
618           code << "    }\n";
619         } else {
620           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
621         }
622         break;
623       case CEED_EVAL_GRAD:
624         if (is_at_points) {
625           // Accumulator for point data
626           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "*dim];\n";
627           code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "; i++) {\n";
628           code << "      r_c" << var_suffix << "[i] = 0.0;\n";
629           code << "    }\n";
630         } else if (use_3d_slices) {
631           // Accumulator for gradient slices
632           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
633           code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) {\n";
634           code << "      r_q" << var_suffix << "[i] = 0.0;\n";
635           code << "    }\n";
636         } else {
637           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
638         }
639         break;
640       case CEED_EVAL_WEIGHT:
641         break;
642         // LCOV_EXCL_START
643       case CEED_EVAL_DIV:
644       case CEED_EVAL_CURL:
645         break;  // TODO: Not implemented
646                 // LCOV_EXCL_STOP
647     }
648   }
649 
650   if (is_at_points) {
651     // We need to handle batches of points
652     code << "\n    // Note: Using batches of points\n";
653     code << "    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * max_num_points / (blockDim.x * blockDim.y));\n\n";
654     code << "    #pragma unroll\n";
655     code << "    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {\n";
656     code << "      const CeedInt p = i % max_num_points;\n\n";
657 
658     code << "      // -- Coordinates\n";
659     code << "      CeedScalar r_x[dim];\n";
660     code << "      ReadPoint<dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n";
661 
662     code << "      // -- Input fields\n";
663     for (CeedInt i = 0; i < num_input_fields; i++) {
664       std::string var_suffix = "_in_" + std::to_string(i);
665 
666       code << "      // ---- Input field " << i << "\n";
667       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
668       // Basis action
669       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
670       switch (eval_mode) {
671         case CEED_EVAL_NONE:
672           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
673           code << "      ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
674                << ", max_num_points>(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
675           break;
676         case CEED_EVAL_INTERP:
677           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
678           code << "      InterpAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_c" << var_suffix
679                << ", r_x, r_s" << var_suffix << ");\n";
680           break;
681         case CEED_EVAL_GRAD:
682           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
683           code << "      GradAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_c" << var_suffix
684                << ", r_x, r_s" << var_suffix << ");\n";
685           break;
686         case CEED_EVAL_WEIGHT:
687           code << "      CeedScalar r_s" << var_suffix << "[1];\n";
688           code << "      r_s" << var_suffix << "[0] = 1.0;\n";
689           break;
690           // LCOV_EXCL_START
691         case CEED_EVAL_DIV:
692         case CEED_EVAL_CURL:
693           break;  // TODO: Not implemented
694                   // LCOV_EXCL_STOP
695       }
696     }
697     code << "\n      // -- Output fields\n";
698     for (CeedInt i = 0; i < num_output_fields; i++) {
699       std::string var_suffix = "_out_" + std::to_string(i);
700 
701       code << "      // ---- Output field " << i << "\n";
702       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
703       // Basis action
704       switch (eval_mode) {
705         case CEED_EVAL_NONE:
706           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
707           break;
708         case CEED_EVAL_INTERP:
709           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
710           break;
711         case CEED_EVAL_GRAD:
712           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
713           break;
714           // LCOV_EXCL_START
715         case CEED_EVAL_WEIGHT:
716           break;  // Should not occur
717         case CEED_EVAL_DIV:
718         case CEED_EVAL_CURL:
719           break;  // TODO: Not implemented
720                   // LCOV_EXCL_STOP
721       }
722     }
723 
724   } else if (use_3d_slices) {
725     // We treat quadrature points per slice in 3d to save registers
726     code << "\n    // Note: Using planes of 3D elements\n";
727     code << "    #pragma unroll\n";
728     code << "    for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
729     code << "      // -- Input fields\n";
730     for (CeedInt i = 0; i < num_input_fields; i++) {
731       std::string var_suffix = "_in_" + std::to_string(i);
732 
733       code << "      // ---- Input field " << i << "\n";
734       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
735       // Basis action
736       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
737       switch (eval_mode) {
738         case CEED_EVAL_NONE:
739           bool is_strided;
740 
741           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
742 
743           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
744           CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
745           if (is_strided) {
746             bool    has_backend_strides;
747             CeedInt num_elem, elem_size;
748 
749             CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
750             CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
751             CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
752             CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
753 
754             if (!has_backend_strides) {
755               CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
756             }
757             code << "      // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
758             code << "      ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", " << strides[0] << ", " << strides[1] << ", "
759                  << strides[2] << ">(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n";
760           } else {
761             CeedSize                  l_size = 0;
762             CeedInt                   comp_stride;
763             CeedElemRestriction_Cuda *rstr_data;
764 
765             CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
766             code << "      const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
767             CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
768             code << "      // CompStride: " << comp_stride << "\n";
769             CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
770             data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
771             code << "      ReadEVecSliceStandard3d<num_comp" << var_suffix << ", " << comp_stride << ", " << Q_name << ">(data, l_size" << var_suffix
772                  << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
773           }
774           CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
775           break;
776         case CEED_EVAL_INTERP:
777           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
778           code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n";
779           code << "        r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n";
780           code << "      }\n";
781           break;
782         case CEED_EVAL_GRAD:
783           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
784           code << "      GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_q" << var_suffix << ", s_G" << var_suffix
785                << ", r_s" << var_suffix << ");\n";
786           break;
787         case CEED_EVAL_WEIGHT:
788           code << "      CeedScalar r_s" << var_suffix << "[1];\n";
789           code << "      r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n";
790           break;
791           // LCOV_EXCL_START
792         case CEED_EVAL_DIV:
793         case CEED_EVAL_CURL:
794           break;  // TODO: Not implemented
795                   // LCOV_EXCL_STOP
796       }
797     }
798     code << "\n      // -- Output fields\n";
799     for (CeedInt i = 0; i < num_output_fields; i++) {
800       std::string var_suffix = "_out_" + std::to_string(i);
801 
802       code << "      // ---- Output field " << i << "\n";
803       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
804       // Basis action
805       switch (eval_mode) {
806         case CEED_EVAL_NONE:
807           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
808           break;
809         case CEED_EVAL_INTERP:
810           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
811           break;
812         case CEED_EVAL_GRAD:
813           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
814           break;
815           // LCOV_EXCL_START
816         case CEED_EVAL_WEIGHT:
817           break;  // Should not occur
818         case CEED_EVAL_DIV:
819         case CEED_EVAL_CURL:
820           break;  // TODO: Not implemented
821                   // LCOV_EXCL_STOP
822       }
823     }
824   } else {
825     code << "\n    // Note: Using full elements\n";
826     code << "    {\n";
827     code << "      // -- Input fields\n";
828     for (CeedInt i = 0; i < num_input_fields; i++) {
829       code << "      // ---- Input field " << i << "\n";
830       code << "      CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n";
831     }
832     code << "      // -- Output fields\n";
833     for (CeedInt i = 0; i < num_output_fields; i++) {
834       code << "      // ---- Output field " << i << "\n";
835       code << "      CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n";
836     }
837   }
838 
839   // Input and output buffers
840   code << "\n      // -- QFunction inputs and outputs\n";
841   code << "      // ---- Inputs\n";
842   code << "      CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
843   for (CeedInt i = 0; i < num_input_fields; i++) {
844     code << "      // ------ Input field " << i << "\n";
845     code << "      inputs[" << i << "] = r_s_in_" << i << ";\n";
846   }
847   code << "      // ---- Outputs\n";
848   code << "      CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
849   for (CeedInt i = 0; i < num_output_fields; i++) {
850     code << "      // ------ Output field " << i << "\n";
851     code << "      outputs[" << i << "] = r_s_out_" << i << ";\n";
852   }
853 
854   // Apply QFunction
855   code << "\n      // -- Apply QFunction\n";
856   code << "      " << qfunction_name << "(ctx, ";
857   if (dim != 3 || is_at_points || use_3d_slices || !is_tensor) {
858     code << "1";
859   } else {
860     code << Q_name;
861   }
862   code << ", inputs, outputs);\n";
863 
864   if (is_at_points) {
865     // Map back to coefficients
866     code << "\n      // -- Output fields\n";
867     for (CeedInt i = 0; i < num_output_fields; i++) {
868       std::string var_suffix = "_out_" + std::to_string(i);
869       std::string P_name     = "P_1d" + var_suffix;
870 
871       code << "      // ---- Output field " << i << "\n";
872       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
873       // Basis action
874       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
875       switch (eval_mode) {
876         case CEED_EVAL_NONE: {
877           CeedInt             comp_stride;
878           CeedElemRestriction elem_rstr;
879 
880           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
881           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
882           CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
883           code << "      const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
884           code << "      WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
885                << ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]"
886                << ", r_s" << var_suffix << ", d" << var_suffix << ");\n";
887           break;
888         }
889         case CEED_EVAL_INTERP:
890           code << "      if (i >= points.num_per_elem[elem]) {\n";
891           code << "        for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
892           code << "      }\n";
893           code << "      InterpTransposeAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_s"
894                << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
895           break;
896         case CEED_EVAL_GRAD:
897           code << "      if (i >= points.num_per_elem[elem]) {\n";
898           code << "        for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim; j++) r_s" << var_suffix << "[j] = 0.0;\n";
899           code << "      }\n";
900           code << "      GradTransposeAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_s"
901                << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
902           break;
903           // LCOV_EXCL_START
904         case CEED_EVAL_WEIGHT:
905           break;  // Should not occur
906         case CEED_EVAL_DIV:
907         case CEED_EVAL_CURL:
908           break;  // TODO: Not implemented
909                   // LCOV_EXCL_STOP
910       }
911     }
912   } else if (use_3d_slices) {
913     // Copy or apply transpose grad, if needed
914     code << "\n      // -- Output fields\n";
915     for (CeedInt i = 0; i < num_output_fields; i++) {
916       std::string var_suffix = "_out_" + std::to_string(i);
917       std::string P_name     = "P_1d" + var_suffix;
918 
919       code << "      // ---- Output field " << i << "\n";
920       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
921       // Basis action
922       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
923       switch (eval_mode) {
924         case CEED_EVAL_NONE:
925           code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
926           code << "        r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
927           code << "      }\n";
928           break;
929         case CEED_EVAL_INTERP:
930           code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
931           code << "        r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
932           code << "      }\n";
933           break;
934         case CEED_EVAL_GRAD:
935           code << "      GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_s" << var_suffix << ", s_G"
936                << var_suffix << ", r_q" << var_suffix << ");\n";
937           break;
938           // LCOV_EXCL_START
939         case CEED_EVAL_WEIGHT:
940           break;  // Should not occur
941         case CEED_EVAL_DIV:
942         case CEED_EVAL_CURL:
943           break;  // TODO: Not implemented
944                   // LCOV_EXCL_STOP
945       }
946     }
947   }
948   code << "    }\n";
949   return CEED_ERROR_SUCCESS;
950 }
951 
952 //------------------------------------------------------------------------------
953 // Build single operator kernel
954 //------------------------------------------------------------------------------
955 extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_build) {
956   bool                    is_tensor = true, is_at_points = false, use_3d_slices = false;
957   Ceed                    ceed;
958   CeedInt                 Q_1d, num_input_fields, num_output_fields, dim = 1, max_num_points = 0, coords_comp_stride = 0;
959   CeedQFunctionField     *qf_input_fields, *qf_output_fields;
960   CeedQFunction_Cuda_gen *qf_data;
961   CeedQFunction           qf;
962   CeedOperatorField      *op_input_fields, *op_output_fields;
963   CeedOperator_Cuda_gen  *data;
964   std::ostringstream      code;
965 
966   CeedCallBackend(CeedOperatorGetData(op, &data));
967   {
968     bool is_setup_done;
969 
970     CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
971     if (is_setup_done) {
972       *is_good_build = !data->use_fallback;
973       return CEED_ERROR_SUCCESS;
974     }
975   }
976 
977   // Check field compatibility
978   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
979   {
980     bool has_shared_bases = true, is_all_tensor = true, is_all_nontensor = true;
981 
982     for (CeedInt i = 0; i < num_input_fields; i++) {
983       CeedBasis basis;
984 
985       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
986       if (basis != CEED_BASIS_NONE) {
987         bool        is_tensor = true;
988         const char *resource;
989         char       *resource_root;
990         Ceed        basis_ceed;
991 
992         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
993         is_all_tensor    = is_all_tensor && is_tensor;
994         is_all_nontensor = is_all_not_tensor && !is_tensor;
995         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
996         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
997         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
998         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared");
999         CeedCallBackend(CeedFree(&resource_root));
1000         CeedCallBackend(CeedDestroy(&basis_ceed));
1001       }
1002       CeedCallBackend(CeedBasisDestroy(&basis));
1003     }
1004 
1005     for (CeedInt i = 0; i < num_output_fields; i++) {
1006       CeedBasis basis;
1007 
1008       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
1009       if (basis != CEED_BASIS_NONE) {
1010         bool        is_tensor = true;
1011         const char *resource;
1012         char       *resource_root;
1013         Ceed        basis_ceed;
1014 
1015         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
1016         is_all_tensor    = is_all_tensor && is_tensor;
1017         is_all_nontensor = is_all_nontensor && !is_tensor;
1018 
1019         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
1020         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
1021         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
1022         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared");
1023         CeedCallBackend(CeedFree(&resource_root));
1024         CeedCallBackend(CeedDestroy(&basis_ceed));
1025       }
1026       CeedCallBackend(CeedBasisDestroy(&basis));
1027     }
1028     // -- Fallback to ref if not all bases are shared
1029     if (!has_shared_bases || (!is_all_tensor && !is_all_nontensor)) {
1030       *is_good_build = false;
1031       return CEED_ERROR_SUCCESS;
1032     }
1033   }
1034   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1035   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
1036   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
1037   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
1038 
1039   // Get operator data
1040   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
1041   CeedCallBackend(CeedOperatorBuildKernelData_Cuda_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields,
1042                                                        qf_output_fields, &data->max_P_1d, &Q_1d, &dim, &is_tensor, &use_3d_slices));
1043   if (dim == 0) dim = 1;
1044   data->dim = dim;
1045   if (is_at_points) {
1046     CeedElemRestriction_Cuda *rstr_data;
1047     CeedElemRestriction       rstr_points = NULL;
1048 
1049     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
1050     CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
1051     CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
1052     CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data));
1053     data->points.indices = (CeedInt *)rstr_data->d_offsets;
1054     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
1055   }
1056   if (is_at_points) use_3d_slices = false;
1057   if (Q_1d == 0) {
1058     if (is_at_points) Q_1d = max_num_points;
1059     else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d));
1060   }
1061   data->Q_1d = Q_1d;
1062 
1063   // Check for restriction only identity operator
1064   {
1065     bool is_identity_qf;
1066 
1067     CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf));
1068     if (is_identity_qf) {
1069       CeedEvalMode eval_mode_in, eval_mode_out;
1070 
1071       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in));
1072       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out));
1073       CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND,
1074                 "Backend does not implement restriction only identity operators");
1075     }
1076   }
1077 
1078   // Add atomicAdd function for old NVidia architectures
1079   {
1080     Ceed_Cuda            *ceed_data;
1081     struct cudaDeviceProp prop;
1082 
1083     CeedCallBackend(CeedGetData(ceed, &ceed_data));
1084     CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id));
1085     if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) {
1086       code << "// AtomicAdd fallback source\n";
1087       code << "#include <ceed/jit-source/cuda/cuda-atomic-add-fallback.h>\n\n";
1088     }
1089   }
1090 
1091   // Load basis source files
1092   if (is_tensor) {
1093     code << "// Tensor basis source\n";
1094     code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n";
1095   } else {
1096     code << "// Non-tensor basis source\n";
1097     code << "#include <ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h>\n\n";
1098   }
1099   if (is_at_points) {
1100     code << "// AtPoints basis source\n";
1101     code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h>\n\n";
1102   }
1103   code << "// CodeGen operator source\n";
1104   code << "#include <ceed/jit-source/cuda/cuda-gen-templates.h>\n\n";
1105 
1106   // Get QFunction name
1107   std::string qfunction_name(qf_data->qfunction_name);
1108   std::string operator_name;
1109 
1110   operator_name = "CeedKernelCudaGenOperator_" + qfunction_name;
1111 
1112   // Define CEED_Q_VLA
1113   code << "\n#undef CEED_Q_VLA\n";
1114   if (dim != 3 || is_at_points || use_3d_slices || !is_tensor) {
1115     code << "#define CEED_Q_VLA 1\n\n";
1116   } else {
1117     code << "#define CEED_Q_VLA " << Q_1d << "\n\n";
1118   }
1119 
1120   // Add user QFunction source
1121   {
1122     const char *source_path;
1123 
1124     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
1125     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/cuda/gen backend requires QFunction source code file");
1126 
1127     code << "// User QFunction source\n";
1128     code << "#include \"" << source_path << "\"\n\n";
1129   }
1130 
1131   // Setup
1132   code << "\n// -----------------------------------------------------------------------------\n";
1133   code << "// Operator Kernel\n";
1134   code << "// \n";
1135   code << "// d_[in,out]_i:   CeedVector device array\n";
1136   code << "// r_[in,out]_e_i: Element vector register\n";
1137   code << "// r_[in,out]_q_i: Quadrature space vector register\n";
1138   code << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
1139   code << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
1140   code << "// \n";
1141   code << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
1142   code << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
1143   code << "// -----------------------------------------------------------------------------\n";
1144   code << "extern \"C\" __global__ void " << operator_name
1145        << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda "
1146           "points) {\n";
1147 
1148   // Scratch buffers
1149   for (CeedInt i = 0; i < num_input_fields; i++) {
1150     CeedEvalMode eval_mode;
1151 
1152     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1153     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
1154       code << "  const CeedScalar *d_in_" << i << " = fields.inputs[" << i << "];\n";
1155     }
1156   }
1157   for (CeedInt i = 0; i < num_output_fields; i++) {
1158     code << "  CeedScalar *d_out_" << i << " = fields.outputs[" << i << "];\n";
1159   }
1160 
1161   code << "  const CeedInt dim = " << dim << ";\n";
1162   code << "  const CeedInt " << (is_tensor ? "Q_1d" : "Q") << " = " << Q_1d << ";\n";
1163   if (is_at_points) {
1164     code << "  const CeedInt max_num_points = " << max_num_points << ";\n";
1165     code << "  const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
1166   }
1167 
1168   // Shared data
1169   code << "  extern __shared__ CeedScalar slice[];\n";
1170   code << "  SharedData_Cuda data;\n";
1171   code << "  data.t_id_x = threadIdx.x;\n";
1172   code << "  data.t_id_y = threadIdx.y;\n";
1173   code << "  data.t_id_z = threadIdx.z;\n";
1174   code << "  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
1175   code << "  data.slice = slice + data.t_id_z*T_1D" << ((!is_tensor || dim == 1) ? "" : "*T_1D") << ";\n";
1176 
1177   // -- Determine input mat reuse
1178   CeedInt input_matrix_reuse[CEED_FIELD_MAX][3];  // field, is_input, eval_mode
1179 
1180   for (CeedInt i = 0; i < num_input_fields; i++) {
1181     input_matrix_reuse[i][0] = -1;
1182   }
1183   for (CeedInt i = 0; i < num_input_fields; i++) {
1184     CeedEvalMode eval_mode_i;
1185     CeedBasis    basis_i;
1186 
1187     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
1188     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
1189     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
1190     for (CeedInt j = 0; (input_matrix_reuse[i][0] == -1) && (j < i); j++) {
1191       CeedEvalMode eval_mode_j;
1192       CeedBasis    basis_j;
1193 
1194       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1195       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1196       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1197       if (basis_i == basis_j) {
1198         if (is_tensor) {
1199           input_matrix_reuse[i][0] = j;
1200           input_matrix_reuse[i][1] = true;
1201           input_matrix_reuse[i][2] = eval_mode_j;
1202         } else {
1203           // For non-tensor can only re-use with the same eval mode
1204           if (eval_mode_i == eval_mode_j) {
1205             input_matrix_reuse[i][0] = j;
1206             input_matrix_reuse[i][1] = true;
1207             input_matrix_reuse[i][2] = eval_mode_j;
1208           }
1209         }
1210       }
1211       CeedCallBackend(CeedBasisDestroy(&basis_j));
1212     }
1213     CeedCallBackend(CeedBasisDestroy(&basis_i));
1214   }
1215 
1216   // -- Determine output mat reuse
1217   CeedInt output_matrix_reuse[CEED_FIELD_MAX][3];  // field, is_input, eval_mode
1218 
1219   for (CeedInt i = 0; i < num_output_fields; i++) {
1220     output_matrix_reuse[i][0] = -1;
1221   }
1222   for (CeedInt i = 0; i < num_output_fields; i++) {
1223     CeedEvalMode eval_mode_i;
1224     CeedBasis    basis_i;
1225 
1226     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
1227     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
1228     for (CeedInt j = 0; (output_matrix_reuse[i][0] == -1) && (j < num_input_fields); j++) {
1229       CeedEvalMode eval_mode_j;
1230       CeedBasis    basis_j;
1231 
1232       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1233       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1234       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1235       if (basis_i == basis_j) {
1236         if (is_tensor) {
1237           output_matrix_reuse[i][0] = j;
1238           output_matrix_reuse[i][1] = true;
1239           output_matrix_reuse[i][2] = eval_mode_j;
1240         } else {
1241           // For non-tensor can only re-use with the same eval mode
1242           if (eval_mode_i == eval_mode_j) {
1243             output_matrix_reuse[i][0] = j;
1244             output_matrix_reuse[i][1] = true;
1245             output_matrix_reuse[i][2] = eval_mode_j;
1246           }
1247         }
1248       }
1249       CeedCallBackend(CeedBasisDestroy(&basis_j));
1250     }
1251     for (CeedInt j = 0; (output_matrix_reuse[i][0] == -1) && (j < i); j++) {
1252       CeedEvalMode eval_mode_j;
1253       CeedBasis    basis_j;
1254 
1255       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
1256       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1257       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
1258       if (basis_i == basis_j) {
1259         if (is_tensor) {
1260           output_matrix_reuse[i][0] = j;
1261           output_matrix_reuse[i][1] = false;
1262           output_matrix_reuse[i][2] = eval_mode_j;
1263         } else {
1264           // For non-tensor can only re-use with the same eval mode
1265           if (eval_mode_i == eval_mode_j) {
1266             output_matrix_reuse[i][0] = j;
1267             output_matrix_reuse[i][1] = false;
1268             output_matrix_reuse[i][2] = eval_mode_j;
1269           }
1270         }
1271       }
1272       CeedCallBackend(CeedBasisDestroy(&basis_j));
1273     }
1274     CeedCallBackend(CeedBasisDestroy(&basis_i));
1275   }
1276 
1277   // Initialize constants, and matrices B and G
1278   code << "\n  // Input field constants and basis data\n";
1279   for (CeedInt i = 0; i < num_input_fields; i++) {
1280     CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], Q_1d,
1281                                                               true, is_tensor, is_at_points, use_3d_slices));
1282   }
1283   code << "\n  // Output field constants and basis data\n";
1284   for (CeedInt i = 0; i < num_output_fields; i++) {
1285     CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], Q_1d,
1286                                                               false, is_tensor, is_at_points, use_3d_slices));
1287   }
1288 
1289   // Loop over all elements
1290   code << "\n  // Element loop\n";
1291   code << "  __syncthreads();\n";
1292   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
1293 
1294   // -- Compute minimum buffer space needed
1295   CeedInt max_rstr_buffer_size = 1;
1296 
1297   for (CeedInt i = 0; i < num_input_fields; i++) {
1298     CeedInt             num_comp, elem_size;
1299     CeedElemRestriction elem_rstr;
1300 
1301     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
1302     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1303     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
1304     max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_tensor && (dim >= 3) ? elem_size : 1));
1305     CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1306   }
1307   for (CeedInt i = 0; i < num_output_fields; i++) {
1308     CeedInt             num_comp, elem_size;
1309     CeedElemRestriction elem_rstr;
1310 
1311     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
1312     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1313     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
1314     max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_tensor && (dim >= 3) ? elem_size : 1));
1315     CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1316   }
1317   code << "    // Scratch restriction buffer space\n";
1318   code << "    CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
1319 
1320   // -- Determine best input field processing order
1321   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
1322 
1323   for (CeedInt i = 0; i < num_input_fields; i++) {
1324     field_rstr_in_buffer[i] = -1;
1325     input_field_order[i]    = -1;
1326   }
1327   {
1328     bool    is_ordered[CEED_FIELD_MAX];
1329     CeedInt curr_index = 0;
1330 
1331     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
1332     for (CeedInt i = 0; i < num_input_fields; i++) {
1333       CeedVector          vec_i;
1334       CeedElemRestriction rstr_i;
1335 
1336       if (is_ordered[i]) continue;
1337       field_rstr_in_buffer[i]       = i;
1338       is_ordered[i]                 = true;
1339       input_field_order[curr_index] = i;
1340       curr_index++;
1341       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
1342       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
1343       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
1344       for (CeedInt j = i + 1; j < num_input_fields; j++) {
1345         CeedVector          vec_j;
1346         CeedElemRestriction rstr_j;
1347 
1348         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
1349         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
1350         if (rstr_i == rstr_j && vec_i == vec_j) {
1351           field_rstr_in_buffer[j]       = i;
1352           is_ordered[j]                 = true;
1353           input_field_order[curr_index] = j;
1354           curr_index++;
1355         }
1356         CeedCallBackend(CeedVectorDestroy(&vec_j));
1357         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
1358       }
1359       CeedCallBackend(CeedVectorDestroy(&vec_i));
1360       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
1361     }
1362   }
1363 
1364   // -- Input restriction and basis
1365   code << "\n    // -- Input field restrictions and basis actions\n";
1366   for (CeedInt i = 0; i < num_input_fields; i++) {
1367     CeedInt f = input_field_order[i];
1368 
1369     code << "    // ---- Input field " << f << "\n";
1370 
1371     // ---- Restriction
1372     CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, f, dim, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
1373                                                                 Q_1d, true, is_tensor, is_at_points, use_3d_slices));
1374 
1375     // ---- Basis action
1376     CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, is_tensor,
1377                                                           is_at_points, use_3d_slices));
1378   }
1379 
1380   // -- Q function
1381   CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, dim, max_num_points, num_input_fields, op_input_fields, qf_input_fields,
1382                                                             num_output_fields, op_output_fields, qf_output_fields, qfunction_name, Q_1d, is_tensor,
1383                                                             is_at_points, use_3d_slices));
1384 
1385   // -- Output basis and restriction
1386   code << "\n    // -- Output field basis action and restrictions\n";
1387   for (CeedInt i = 0; i < num_output_fields; i++) {
1388     code << "    // ---- Output field " << i << "\n";
1389 
1390     // ---- Basis action
1391     CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_tensor,
1392                                                           is_at_points, use_3d_slices));
1393 
1394     // ---- Restriction
1395     CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false,
1396                                                                 is_tensor, is_at_points, use_3d_slices));
1397   }
1398 
1399   // Close loop and function
1400   code << "  }\n";
1401   code << "}\n";
1402   code << "// -----------------------------------------------------------------------------\n\n";
1403 
1404   // Compile
1405   {
1406     bool is_compile_good = false;
1407 
1408     CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "T_1D", CeedIntMax(Q_1d, data->max_P_1d)));
1409     if (is_compile_good) {
1410       *is_good_build = true;
1411       CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op));
1412     } else {
1413       *is_good_build     = false;
1414       data->use_fallback = true;
1415     }
1416   }
1417   CeedCallBackend(CeedOperatorSetSetupDone(op));
1418   CeedCallBackend(CeedDestroy(&ceed));
1419   CeedCallBackend(CeedQFunctionDestroy(&qf));
1420   return CEED_ERROR_SUCCESS;
1421 }
1422 
1423 //------------------------------------------------------------------------------
1424