xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision 6ab8e59f8abf7a04a361d274e3bc30e1205b0661)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #define CEED_DEBUG_COLOR 12
9 
10 #include <ceed.h>
11 #include <ceed/backend.h>
12 #include <ceed/jit-tools.h>
13 #include <cuda_runtime.h>
14 
15 #include <iostream>
16 #include <sstream>
17 #include <string>
18 
19 #include "../cuda-ref/ceed-cuda-ref.h"
20 #include "../cuda-shared/ceed-cuda-shared.h"
21 #include "../cuda/ceed-cuda-common.h"
22 #include "../cuda/ceed-cuda-compile.h"
23 #include "ceed-cuda-gen.h"
24 
25 struct FieldReuse_Cuda {
26   CeedInt      index;
27   bool         is_input;
28   CeedEvalMode eval_mode;
29 };
30 
31 //------------------------------------------------------------------------------
32 // Determine type of operator
33 //------------------------------------------------------------------------------
34 static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
35                                                 CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields,
36                                                 CeedQFunctionField *qf_output_fields, CeedInt *max_P_1d, CeedInt *Q_1d, CeedInt *dim, bool *is_tensor,
37                                                 bool *use_3d_slices) {
38   // Find dim, P_1d, Q_1d
39   *max_P_1d  = 0;
40   *Q_1d      = 0;
41   *dim       = 0;
42   *is_tensor = true;
43 
44   for (CeedInt i = 0; i < num_input_fields; i++) {
45     CeedBasis basis;
46 
47     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
48     if (basis != CEED_BASIS_NONE) {
49       bool    is_field_tensor;
50       CeedInt field_P_1d = 0, field_Q_1d = 0, field_dim = 0;
51 
52       // Collect dim, P_1d, and Q_1d
53       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
54       *is_tensor = *is_tensor && is_field_tensor;
55       if (is_field_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
56       else CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P_1d));
57       *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
58       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
59       CeedCheck(*dim == 0 || field_dim == *dim, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
60       *dim = field_dim;
61       if (is_field_tensor) CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
62       else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q_1d));
63       CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
64       *Q_1d = field_Q_1d;
65     }
66     CeedCallBackend(CeedBasisDestroy(&basis));
67   }
68   for (CeedInt i = 0; i < num_output_fields; i++) {
69     CeedBasis basis;
70 
71     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
72     if (basis != CEED_BASIS_NONE) {
73       bool    is_field_tensor;
74       CeedInt field_P_1d = 0, field_Q_1d = 0, field_dim = 0;
75 
76       // Collect dim, P_1d, and Q_1d
77       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
78       *is_tensor = *is_tensor && is_field_tensor;
79       if (is_field_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
80       else CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P_1d));
81       *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
82       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
83       CeedCheck(*dim == 0 || field_dim == *dim, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
84       *dim = field_dim;
85       if (is_field_tensor) CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
86       else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q_1d));
87       CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
88       *Q_1d = field_Q_1d;
89     }
90     CeedCallBackend(CeedBasisDestroy(&basis));
91   }
92 
93   // Only use 3D collocated gradient parallelization strategy when gradient is computed
94   *use_3d_slices = false;
95   if (*dim == 3) {
96     bool was_grad_found = false;
97 
98     for (CeedInt i = 0; i < num_input_fields; i++) {
99       CeedEvalMode eval_mode;
100 
101       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
102       if (eval_mode == CEED_EVAL_GRAD) {
103         CeedBasis_Cuda_shared *basis_data;
104         CeedBasis              basis;
105 
106         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
107         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
108         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
109         was_grad_found = true;
110         CeedCallBackend(CeedBasisDestroy(&basis));
111       }
112     }
113     for (CeedInt i = 0; i < num_output_fields; i++) {
114       CeedEvalMode eval_mode;
115 
116       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
117       if (eval_mode == CEED_EVAL_GRAD) {
118         CeedBasis_Cuda_shared *basis_data;
119         CeedBasis              basis;
120 
121         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
122         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
123         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
124         was_grad_found = true;
125         CeedCallBackend(CeedBasisDestroy(&basis));
126       }
127     }
128   }
129   return CEED_ERROR_SUCCESS;
130 }
131 
132 //------------------------------------------------------------------------------
133 // Setup fields
134 //------------------------------------------------------------------------------
135 static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedOperatorField op_field,
136                                                      CeedQFunctionField qf_field, FieldReuse_Cuda field_reuse, CeedInt Q_1d, bool is_input,
137                                                      bool is_tensor, bool is_at_points, bool use_3d_slices) {
138   std::string            var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
139   std::string            P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
140   std::string            option_name = (is_input ? "inputs" : "outputs");
141   CeedEvalMode           eval_mode   = CEED_EVAL_NONE;
142   CeedInt                elem_size = 0, num_comp = 0, P_1d = 0;
143   CeedElemRestriction    elem_rstr;
144   CeedBasis_Cuda_shared *basis_data;
145   CeedBasis              basis;
146 
147   // Field reuse info
148   bool use_previous_field = field_reuse.index != -1;
149 
150   code << "  // -- " << (is_input ? "Input" : "Output") << " field " << i << "\n";
151 
152   // Get field data
153   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
154   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
155     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
156     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
157   }
158   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
159   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
160   if (basis != CEED_BASIS_NONE) {
161     CeedCallBackend(CeedBasisGetData(basis, &basis_data));
162     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
163     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
164   }
165   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
166 
167   // Set field constants
168   if (eval_mode != CEED_EVAL_WEIGHT) {
169     code << "  const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
170     code << "  const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
171   }
172 
173   // Load basis data
174   code << "  // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
175   switch (eval_mode) {
176     case CEED_EVAL_NONE:
177       break;
178     case CEED_EVAL_INTERP:
179       if (is_at_points) {
180         // AtPoints
181         if (!basis_data->d_chebyshev_interp_1d) {
182           CeedSize    interp_bytes;
183           CeedScalar *chebyshev_interp_1d;
184 
185           interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
186           CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
187           CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
188           CeedCallCuda(CeedBasisReturnCeed(basis), cudaMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
189           CeedCallCuda(CeedBasisReturnCeed(basis),
190                        cudaMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
191           CeedCallBackend(CeedFree(&chebyshev_interp_1d));
192         }
193         if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
194         else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
195       } else {
196         // Standard quadrature
197         if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
198         else data->B.outputs[i] = basis_data->d_interp_1d;
199       }
200       if (use_previous_field) {
201         std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
202 
203         code << "  CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
204       } else {
205         code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
206         code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
207       }
208       break;
209     case CEED_EVAL_GRAD:
210       if (is_at_points) {
211         // AtPoints
212         if (!basis_data->d_chebyshev_interp_1d) {
213           CeedSize    interp_bytes;
214           CeedScalar *chebyshev_interp_1d;
215 
216           interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
217           CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
218           CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
219           CeedCallCuda(CeedBasisReturnCeed(basis), cudaMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
220           CeedCallCuda(CeedBasisReturnCeed(basis),
221                        cudaMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
222           CeedCallBackend(CeedFree(&chebyshev_interp_1d));
223         }
224         if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
225         else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
226       } else {
227         // Standard quadrature
228         if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
229         else data->B.outputs[i] = basis_data->d_interp_1d;
230       }
231       if (is_tensor) {
232         if (use_previous_field) {
233           std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
234 
235           code << "  CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
236         } else {
237           code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
238           code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
239         }
240       }
241       if (is_at_points) break;  // No G mat for AtPoints
242       if (use_3d_slices) {
243         if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
244         else data->G.outputs[i] = basis_data->d_collo_grad_1d;
245         if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
246           std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
247 
248           code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
249         } else {
250           code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
251           code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
252         }
253       } else {
254         bool has_collo_grad = basis_data->d_collo_grad_1d;
255 
256         if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
257         else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
258         if (has_collo_grad) {
259           if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
260             std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
261 
262             code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
263           } else {
264             code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
265             code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
266           }
267         } else {
268           if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
269             std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
270 
271             code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
272           } else {
273             code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim") << "];\n";
274             code << "  LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << ">(data, G." << option_name << "[" << i << "], s_G"
275                  << var_suffix << ");\n";
276           }
277         }
278       }
279       break;
280     case CEED_EVAL_WEIGHT:
281       break;  // No action
282       // LCOV_EXCL_START
283     case CEED_EVAL_DIV:
284     case CEED_EVAL_CURL:
285       break;  // TODO: Not implemented
286               // LCOV_EXCL_STOP
287   }
288   CeedCallBackend(CeedBasisDestroy(&basis));
289   return CEED_ERROR_SUCCESS;
290 }
291 
292 //------------------------------------------------------------------------------
293 // Restriction
294 //------------------------------------------------------------------------------
295 static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim,
296                                                        CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
297                                                        CeedInt Q_1d, bool is_input, bool is_tensor, bool is_at_points, bool use_3d_slices) {
298   std::string               var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
299   std::string               P_name     = (is_tensor ? "P_1d" : "P") + var_suffix;
300   CeedEvalMode              eval_mode  = CEED_EVAL_NONE;
301   CeedInt                   elem_size = 0, num_comp = 0, P_1d = 0;
302   CeedSize                  l_size;
303   CeedRestrictionType       rstr_type = CEED_RESTRICTION_STANDARD;
304   CeedElemRestriction_Cuda *rstr_data;
305   CeedElemRestriction       elem_rstr;
306   CeedBasis                 basis;
307 
308   // Get field data
309   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
310   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
311     CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
312     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
313     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
314     CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
315   }
316   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
317   if (basis != CEED_BASIS_NONE) {
318     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
319     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
320   }
321   CeedCallBackend(CeedBasisDestroy(&basis));
322   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
323 
324   // Restriction
325   if (is_input) {
326     // Input
327     if (field_input_buffer[i] != i) {
328       std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);
329 
330       // Restriction was already done for previous input
331       code << "    CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
332     } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) {
333       if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) {
334         // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated
335         code << "    CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
336       } else if (rstr_type != CEED_RESTRICTION_POINTS) {
337         // Otherwise we're using the scratch space
338         code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
339       }
340       switch (rstr_type) {
341         case CEED_RESTRICTION_STANDARD: {
342           CeedInt comp_stride;
343 
344           CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
345           code << "    const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
346           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
347           code << "    // CompStride: " << comp_stride << "\n";
348           data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
349           code << "    ReadLVecStandard" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name
350                << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix << ");\n";
351           break;
352         }
353         case CEED_RESTRICTION_STRIDED: {
354           bool    has_backend_strides;
355           CeedInt num_elem;
356 
357           CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
358           CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
359           CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
360 
361           if (!has_backend_strides) {
362             CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
363           }
364           code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
365           code << "    ReadLVecStrided" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", " << strides[0] << ", "
366                << strides[1] << ", " << strides[2] << ">(data, elem, d" << var_suffix << ", r_e" << var_suffix << ");\n";
367           break;
368         }
369         case CEED_RESTRICTION_POINTS: {
370           CeedInt comp_stride;
371 
372           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
373           code << "    const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
374           data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
375           break;
376         }
377         // LCOV_EXCL_START
378         case CEED_RESTRICTION_ORIENTED:
379         case CEED_RESTRICTION_CURL_ORIENTED:
380           break;  // TODO: Not implemented
381                   // LCOV_EXCL_STOP
382       }
383     }
384   } else {
385     // Output
386     switch (rstr_type) {
387       case CEED_RESTRICTION_STANDARD: {
388         CeedInt comp_stride;
389 
390         CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
391         code << "    const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
392         CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
393         code << "    // CompStride: " << comp_stride << "\n";
394         data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
395         code << "    WriteLVecStandard" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name
396              << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix << ");\n";
397         break;
398       }
399       case CEED_RESTRICTION_STRIDED: {
400         bool    has_backend_strides;
401         CeedInt num_elem;
402 
403         CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
404         CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
405         CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
406 
407         if (!has_backend_strides) {
408           CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
409         }
410         code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
411         code << "    WriteLVecStrided" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", " << strides[0] << ", "
412              << strides[1] << ", " << strides[2] << ">(data, elem, r_e" << var_suffix << ", d" << var_suffix << ");\n";
413         break;
414       }
415       case CEED_RESTRICTION_POINTS:
416         data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
417         break;
418       // LCOV_EXCL_START
419       case CEED_RESTRICTION_ORIENTED:
420       case CEED_RESTRICTION_CURL_ORIENTED:
421         break;  // TODO: Not implemented
422                 // LCOV_EXCL_STOP
423     }
424   }
425   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
426   return CEED_ERROR_SUCCESS;
427 }
428 
429 //------------------------------------------------------------------------------
430 // Basis
431 //------------------------------------------------------------------------------
432 static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim,
433                                                  CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool is_tensor,
434                                                  bool is_at_points, bool use_3d_slices) {
435   std::string         var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
436   std::string         P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
437   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
438   CeedInt             elem_size = 0, num_comp = 0, P_1d = 0;
439   CeedElemRestriction elem_rstr;
440   CeedBasis           basis;
441 
442   // Get field data
443   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
444   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
445     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
446     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
447   }
448   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
449   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
450   if (basis != CEED_BASIS_NONE) {
451     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
452     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
453   }
454   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
455 
456   // Basis
457   code << "    // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
458   if (is_input) {
459     switch (eval_mode) {
460       case CEED_EVAL_NONE:
461         if (!use_3d_slices && !is_at_points) {
462           code << "    CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n";
463         }
464         break;
465       case CEED_EVAL_INTERP:
466         if (is_at_points) {
467           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
468 
469           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
470           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
471                << var_suffix << ", r_c" << var_suffix << ");\n";
472         } else {
473           std::string function_name = is_tensor ? ((dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d") : "InterpNonTensor";
474 
475           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
476           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
477                << var_suffix << ", r_q" << var_suffix << ");\n";
478         }
479         break;
480       case CEED_EVAL_GRAD:
481         if (is_at_points) {
482           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
483 
484           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
485           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
486                << var_suffix << ", r_c" << var_suffix << ");\n";
487         } else if (use_3d_slices) {
488           std::string function_name = (dim > 1 ? "InterpTensor" : "Interp") + std::to_string(dim) + "d";
489 
490           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
491           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
492                << var_suffix << ", r_q" << var_suffix << ");\n";
493         } else if (is_tensor) {
494           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
495           std::string function_name = (dim == 1 ? "Grad" : (is_collocated ? "GradTensorCollocated" : "GradTensor")) + std::to_string(dim) + "d";
496 
497           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << (dim >= 3 ? Q_name : "1") << "];\n";
498           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
499                << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
500         } else {
501           std::string function_name = "GradNonTensor";
502 
503           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
504           code << "    " << function_name << "<num_comp" << var_suffix << ", dim, " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix
505                << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
506         }
507         break;
508       case CEED_EVAL_WEIGHT: {
509         if (is_at_points) {
510           code << "    // Nothing to do AtPoints\n";
511         } else {
512           CeedBasis_Cuda_shared *basis_data;
513           std::string            function_name = is_tensor ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d") : "WeightNonTensor";
514 
515           code << "    CeedScalar r_q" << var_suffix << "[" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
516           CeedCallBackend(CeedBasisGetData(basis, &basis_data));
517           data->W = basis_data->d_q_weight_1d;
518           code << "    " << function_name << "<" << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
519         }
520         break;
521       }
522       // LCOV_EXCL_START
523       case CEED_EVAL_DIV:
524       case CEED_EVAL_CURL:
525         break;  // TODO: Not implemented
526                 // LCOV_EXCL_STOP
527     }
528   } else {
529     switch (eval_mode) {
530       case CEED_EVAL_NONE:
531         code << "    CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
532         break;  // No action
533       case CEED_EVAL_INTERP:
534         code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
535         if (is_at_points) {
536           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
537 
538           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_c" << var_suffix << ", s_B"
539                << var_suffix << ", r_e" << var_suffix << ");\n";
540         } else {
541           std::string function_name =
542               is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d") : "InterpTransposeNonTensor";
543 
544           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix << ", s_B"
545                << var_suffix << ", r_e" << var_suffix << ");\n";
546         }
547         break;
548       case CEED_EVAL_GRAD:
549         code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
550         if (is_at_points) {
551           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
552 
553           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_c" << var_suffix << ", s_B"
554                << var_suffix << ", r_e" << var_suffix << ");\n";
555         } else if (use_3d_slices) {
556           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
557 
558           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix << ", s_B"
559                << var_suffix << ", r_e" << var_suffix << ");\n";
560         } else if (is_tensor) {
561           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
562           std::string function_name =
563               (dim == 1 ? "GradTranspose" : (is_collocated ? "GradTransposeTensorCollocated" : "GradTransposeTensor")) + std::to_string(dim) + "d";
564 
565           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix << ", s_B"
566                << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
567         } else {
568           std::string function_name = "GradTransposeNonTensor";
569 
570           code << "    " << function_name << "<num_comp" << var_suffix << ", dim, " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix
571                << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
572         }
573         break;
574       // LCOV_EXCL_START
575       case CEED_EVAL_WEIGHT:
576         break;  // Should not occur
577       case CEED_EVAL_DIV:
578       case CEED_EVAL_CURL:
579         break;  // TODO: Not implemented
580                 // LCOV_EXCL_STOP
581     }
582   }
583   CeedCallBackend(CeedBasisDestroy(&basis));
584   return CEED_ERROR_SUCCESS;
585 }
586 
587 //------------------------------------------------------------------------------
588 // QFunction
589 //------------------------------------------------------------------------------
590 static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt dim, CeedInt max_num_points,
591                                                      CeedInt num_input_fields, CeedOperatorField *op_input_fields,
592                                                      CeedQFunctionField *qf_input_fields, CeedInt num_output_fields,
593                                                      CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields,
594                                                      std::string qfunction_name, CeedInt Q_1d, bool is_tensor, bool is_at_points,
595                                                      bool use_3d_slices) {
596   std::string         Q_name    = is_tensor ? "Q_1d" : "Q";
597   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
598   CeedElemRestriction elem_rstr;
599 
600   // Setup output arrays
601   code << "\n    // -- Output field setup\n";
602   for (CeedInt i = 0; i < num_output_fields; i++) {
603     std::string var_suffix = "_out_" + std::to_string(i);
604 
605     code << "    // ---- Output field " << i << "\n";
606     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
607     switch (eval_mode) {
608       case CEED_EVAL_NONE:
609         if (is_at_points) {
610           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n";
611         } else {
612           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
613         }
614         break;
615       case CEED_EVAL_INTERP:
616         if (is_at_points) {
617           // Accumulator for point data
618           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
619           code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "; i++) {\n";
620           code << "      r_c" << var_suffix << "[i] = 0.0;\n";
621           code << "    }\n";
622         } else {
623           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
624         }
625         break;
626       case CEED_EVAL_GRAD:
627         if (is_at_points) {
628           // Accumulator for point data
629           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "*dim];\n";
630           code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "; i++) {\n";
631           code << "      r_c" << var_suffix << "[i] = 0.0;\n";
632           code << "    }\n";
633         } else if (use_3d_slices) {
634           // Accumulator for gradient slices
635           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
636           code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) {\n";
637           code << "      r_q" << var_suffix << "[i] = 0.0;\n";
638           code << "    }\n";
639         } else {
640           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
641         }
642         break;
643       case CEED_EVAL_WEIGHT:
644         break;
645         // LCOV_EXCL_START
646       case CEED_EVAL_DIV:
647       case CEED_EVAL_CURL:
648         break;  // TODO: Not implemented
649                 // LCOV_EXCL_STOP
650     }
651   }
652 
653   if (is_at_points) {
654     // We need to handle batches of points
655     code << "\n    // Note: Using batches of points\n";
656     code << "    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * max_num_points / (blockDim.x * blockDim.y));\n\n";
657     code << "    #pragma unroll\n";
658     code << "    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {\n";
659     code << "      const CeedInt p = i % max_num_points;\n\n";
660 
661     code << "      // -- Coordinates\n";
662     code << "      CeedScalar r_x[dim];\n";
663     code << "      ReadPoint<dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n";
664 
665     code << "      // -- Input fields\n";
666     for (CeedInt i = 0; i < num_input_fields; i++) {
667       std::string var_suffix = "_in_" + std::to_string(i);
668 
669       code << "      // ---- Input field " << i << "\n";
670       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
671       // Basis action
672       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
673       switch (eval_mode) {
674         case CEED_EVAL_NONE:
675           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
676           code << "      ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
677                << ", max_num_points>(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
678           break;
679         case CEED_EVAL_INTERP:
680           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
681           code << "      InterpAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_c" << var_suffix
682                << ", r_x, r_s" << var_suffix << ");\n";
683           break;
684         case CEED_EVAL_GRAD:
685           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
686           code << "      GradAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_c" << var_suffix
687                << ", r_x, r_s" << var_suffix << ");\n";
688           break;
689         case CEED_EVAL_WEIGHT:
690           code << "      CeedScalar r_s" << var_suffix << "[1];\n";
691           code << "      r_s" << var_suffix << "[0] = 1.0;\n";
692           break;
693           // LCOV_EXCL_START
694         case CEED_EVAL_DIV:
695         case CEED_EVAL_CURL:
696           break;  // TODO: Not implemented
697                   // LCOV_EXCL_STOP
698       }
699     }
700     code << "\n      // -- Output fields\n";
701     for (CeedInt i = 0; i < num_output_fields; i++) {
702       std::string var_suffix = "_out_" + std::to_string(i);
703 
704       code << "      // ---- Output field " << i << "\n";
705       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
706       // Basis action
707       switch (eval_mode) {
708         case CEED_EVAL_NONE:
709           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
710           break;
711         case CEED_EVAL_INTERP:
712           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
713           break;
714         case CEED_EVAL_GRAD:
715           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
716           break;
717           // LCOV_EXCL_START
718         case CEED_EVAL_WEIGHT:
719           break;  // Should not occur
720         case CEED_EVAL_DIV:
721         case CEED_EVAL_CURL:
722           break;  // TODO: Not implemented
723                   // LCOV_EXCL_STOP
724       }
725     }
726 
727   } else if (use_3d_slices) {
728     // We treat quadrature points per slice in 3d to save registers
729     code << "\n    // Note: Using planes of 3D elements\n";
730     code << "    #pragma unroll\n";
731     code << "    for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
732     code << "      // -- Input fields\n";
733     for (CeedInt i = 0; i < num_input_fields; i++) {
734       std::string var_suffix = "_in_" + std::to_string(i);
735 
736       code << "      // ---- Input field " << i << "\n";
737       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
738       // Basis action
739       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
740       switch (eval_mode) {
741         case CEED_EVAL_NONE:
742           bool is_strided;
743 
744           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
745 
746           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
747           CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
748           if (is_strided) {
749             bool    has_backend_strides;
750             CeedInt num_elem, elem_size;
751 
752             CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
753             CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
754             CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
755             CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
756 
757             if (!has_backend_strides) {
758               CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
759             }
760             code << "      // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
761             code << "      ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", " << strides[0] << ", " << strides[1] << ", "
762                  << strides[2] << ">(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n";
763           } else {
764             CeedSize                  l_size = 0;
765             CeedInt                   comp_stride;
766             CeedElemRestriction_Cuda *rstr_data;
767 
768             CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
769             code << "      const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
770             CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
771             code << "      // CompStride: " << comp_stride << "\n";
772             CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
773             data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
774             code << "      ReadEVecSliceStandard3d<num_comp" << var_suffix << ", " << comp_stride << ", " << Q_name << ">(data, l_size" << var_suffix
775                  << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
776           }
777           CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
778           break;
779         case CEED_EVAL_INTERP:
780           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
781           code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n";
782           code << "        r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n";
783           code << "      }\n";
784           break;
785         case CEED_EVAL_GRAD:
786           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
787           code << "      GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_q" << var_suffix << ", s_G" << var_suffix
788                << ", r_s" << var_suffix << ");\n";
789           break;
790         case CEED_EVAL_WEIGHT:
791           code << "      CeedScalar r_s" << var_suffix << "[1];\n";
792           code << "      r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n";
793           break;
794           // LCOV_EXCL_START
795         case CEED_EVAL_DIV:
796         case CEED_EVAL_CURL:
797           break;  // TODO: Not implemented
798                   // LCOV_EXCL_STOP
799       }
800     }
801     code << "\n      // -- Output fields\n";
802     for (CeedInt i = 0; i < num_output_fields; i++) {
803       std::string var_suffix = "_out_" + std::to_string(i);
804 
805       code << "      // ---- Output field " << i << "\n";
806       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
807       // Basis action
808       switch (eval_mode) {
809         case CEED_EVAL_NONE:
810           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
811           break;
812         case CEED_EVAL_INTERP:
813           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
814           break;
815         case CEED_EVAL_GRAD:
816           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
817           break;
818           // LCOV_EXCL_START
819         case CEED_EVAL_WEIGHT:
820           break;  // Should not occur
821         case CEED_EVAL_DIV:
822         case CEED_EVAL_CURL:
823           break;  // TODO: Not implemented
824                   // LCOV_EXCL_STOP
825       }
826     }
827   } else {
828     code << "\n    // Note: Using full elements\n";
829     code << "    {\n";
830     code << "      // -- Input fields\n";
831     for (CeedInt i = 0; i < num_input_fields; i++) {
832       code << "      // ---- Input field " << i << "\n";
833       code << "      CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n";
834     }
835     code << "      // -- Output fields\n";
836     for (CeedInt i = 0; i < num_output_fields; i++) {
837       code << "      // ---- Output field " << i << "\n";
838       code << "      CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n";
839     }
840   }
841 
842   // Input and output buffers
843   code << "\n      // -- QFunction inputs and outputs\n";
844   code << "      // ---- Inputs\n";
845   code << "      CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
846   for (CeedInt i = 0; i < num_input_fields; i++) {
847     code << "      // ------ Input field " << i << "\n";
848     code << "      inputs[" << i << "] = r_s_in_" << i << ";\n";
849   }
850   code << "      // ---- Outputs\n";
851   code << "      CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
852   for (CeedInt i = 0; i < num_output_fields; i++) {
853     code << "      // ------ Output field " << i << "\n";
854     code << "      outputs[" << i << "] = r_s_out_" << i << ";\n";
855   }
856 
857   // Apply QFunction
858   code << "\n      // -- Apply QFunction\n";
859   code << "      " << qfunction_name << "(ctx, ";
860   if (dim != 3 || is_at_points || use_3d_slices || !is_tensor) {
861     code << "1";
862   } else {
863     code << Q_name;
864   }
865   code << ", inputs, outputs);\n";
866 
867   if (is_at_points) {
868     // Map back to coefficients
869     code << "\n      // -- Output fields\n";
870     for (CeedInt i = 0; i < num_output_fields; i++) {
871       std::string var_suffix = "_out_" + std::to_string(i);
872       std::string P_name     = "P_1d" + var_suffix;
873 
874       code << "      // ---- Output field " << i << "\n";
875       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
876       // Basis action
877       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
878       switch (eval_mode) {
879         case CEED_EVAL_NONE: {
880           CeedInt             comp_stride;
881           CeedElemRestriction elem_rstr;
882 
883           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
884           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
885           CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
886           code << "      const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
887           code << "      WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
888                << ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]"
889                << ", r_s" << var_suffix << ", d" << var_suffix << ");\n";
890           break;
891         }
892         case CEED_EVAL_INTERP:
893           code << "      if (i >= points.num_per_elem[elem]) {\n";
894           code << "        for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
895           code << "      }\n";
896           code << "      InterpTransposeAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_s"
897                << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
898           break;
899         case CEED_EVAL_GRAD:
900           code << "      if (i >= points.num_per_elem[elem]) {\n";
901           code << "        for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim; j++) r_s" << var_suffix << "[j] = 0.0;\n";
902           code << "      }\n";
903           code << "      GradTransposeAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_s"
904                << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
905           break;
906           // LCOV_EXCL_START
907         case CEED_EVAL_WEIGHT:
908           break;  // Should not occur
909         case CEED_EVAL_DIV:
910         case CEED_EVAL_CURL:
911           break;  // TODO: Not implemented
912                   // LCOV_EXCL_STOP
913       }
914     }
915   } else if (use_3d_slices) {
916     // Copy or apply transpose grad, if needed
917     code << "\n      // -- Output fields\n";
918     for (CeedInt i = 0; i < num_output_fields; i++) {
919       std::string var_suffix = "_out_" + std::to_string(i);
920       std::string P_name     = "P_1d" + var_suffix;
921 
922       code << "      // ---- Output field " << i << "\n";
923       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
924       // Basis action
925       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
926       switch (eval_mode) {
927         case CEED_EVAL_NONE:
928           code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
929           code << "        r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
930           code << "      }\n";
931           break;
932         case CEED_EVAL_INTERP:
933           code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
934           code << "        r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
935           code << "      }\n";
936           break;
937         case CEED_EVAL_GRAD:
938           code << "      GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_s" << var_suffix << ", s_G"
939                << var_suffix << ", r_q" << var_suffix << ");\n";
940           break;
941           // LCOV_EXCL_START
942         case CEED_EVAL_WEIGHT:
943           break;  // Should not occur
944         case CEED_EVAL_DIV:
945         case CEED_EVAL_CURL:
946           break;  // TODO: Not implemented
947                   // LCOV_EXCL_STOP
948       }
949     }
950   }
951   code << "    }\n";
952   return CEED_ERROR_SUCCESS;
953 }
954 
955 //------------------------------------------------------------------------------
956 // Build single operator kernel
957 //------------------------------------------------------------------------------
958 extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_build) {
959   bool                    is_tensor = true, is_at_points = false, use_3d_slices = false;
960   Ceed                    ceed;
961   CeedInt                 Q_1d, num_input_fields, num_output_fields, dim = 1, max_num_points = 0, coords_comp_stride = 0;
962   CeedQFunctionField     *qf_input_fields, *qf_output_fields;
963   CeedQFunction_Cuda_gen *qf_data;
964   CeedQFunction           qf;
965   CeedOperatorField      *op_input_fields, *op_output_fields;
966   CeedOperator_Cuda_gen  *data;
967   std::ostringstream      code;
968 
969   CeedCallBackend(CeedOperatorGetData(op, &data));
970   {
971     bool is_setup_done;
972 
973     CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
974     if (is_setup_done) {
975       *is_good_build = !data->use_fallback;
976       return CEED_ERROR_SUCCESS;
977     }
978   }
979 
980   // Check field compatibility
981   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
982   {
983     bool has_shared_bases = true, is_all_tensor = true, is_all_nontensor = true;
984 
985     for (CeedInt i = 0; i < num_input_fields; i++) {
986       CeedBasis basis;
987 
988       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
989       if (basis != CEED_BASIS_NONE) {
990         bool        is_tensor = true;
991         const char *resource;
992         char       *resource_root;
993         Ceed        basis_ceed;
994 
995         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
996         is_all_tensor    = is_all_tensor && is_tensor;
997         is_all_nontensor = is_all_nontensor && !is_tensor;
998         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
999         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
1000         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
1001         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared");
1002         CeedCallBackend(CeedFree(&resource_root));
1003         CeedCallBackend(CeedDestroy(&basis_ceed));
1004       }
1005       CeedCallBackend(CeedBasisDestroy(&basis));
1006     }
1007 
1008     for (CeedInt i = 0; i < num_output_fields; i++) {
1009       CeedBasis basis;
1010 
1011       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
1012       if (basis != CEED_BASIS_NONE) {
1013         bool        is_tensor = true;
1014         const char *resource;
1015         char       *resource_root;
1016         Ceed        basis_ceed;
1017 
1018         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
1019         is_all_tensor    = is_all_tensor && is_tensor;
1020         is_all_nontensor = is_all_nontensor && !is_tensor;
1021 
1022         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
1023         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
1024         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
1025         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared");
1026         CeedCallBackend(CeedFree(&resource_root));
1027         CeedCallBackend(CeedDestroy(&basis_ceed));
1028       }
1029       CeedCallBackend(CeedBasisDestroy(&basis));
1030     }
1031     // -- Fallback to ref if not all bases are shared
1032     if (!has_shared_bases || (!is_all_tensor && !is_all_nontensor)) {
1033       *is_good_build = false;
1034       return CEED_ERROR_SUCCESS;
1035     }
1036   }
1037   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1038   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
1039   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
1040   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
1041 
1042   // Get operator data
1043   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
1044   CeedCallBackend(CeedOperatorBuildKernelData_Cuda_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields,
1045                                                        qf_output_fields, &data->max_P_1d, &Q_1d, &dim, &is_tensor, &use_3d_slices));
1046   if (dim == 0) dim = 1;
1047   data->dim = dim;
1048   if (is_at_points) {
1049     CeedElemRestriction_Cuda *rstr_data;
1050     CeedElemRestriction       rstr_points = NULL;
1051 
1052     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
1053     CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
1054     CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
1055     CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data));
1056     data->points.indices = (CeedInt *)rstr_data->d_offsets;
1057     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
1058   }
1059   if (is_at_points) use_3d_slices = false;
1060   if (Q_1d == 0) {
1061     if (is_at_points) Q_1d = max_num_points;
1062     else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d));
1063   }
1064   data->Q_1d = Q_1d;
1065 
1066   // Check for restriction only identity operator
1067   {
1068     bool is_identity_qf;
1069 
1070     CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf));
1071     if (is_identity_qf) {
1072       CeedEvalMode eval_mode_in, eval_mode_out;
1073 
1074       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in));
1075       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out));
1076       CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND,
1077                 "Backend does not implement restriction only identity operators");
1078     }
1079   }
1080 
1081   // Add atomicAdd function for old NVidia architectures
1082   {
1083     Ceed_Cuda            *ceed_data;
1084     struct cudaDeviceProp prop;
1085 
1086     CeedCallBackend(CeedGetData(ceed, &ceed_data));
1087     CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id));
1088     if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) {
1089       code << "// AtomicAdd fallback source\n";
1090       code << "#include <ceed/jit-source/cuda/cuda-atomic-add-fallback.h>\n\n";
1091     }
1092   }
1093 
1094   // Load basis source files
1095   if (is_tensor) {
1096     code << "// Tensor basis source\n";
1097     code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n";
1098   } else {
1099     code << "// Non-tensor basis source\n";
1100     code << "#include <ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h>\n\n";
1101   }
1102   if (is_at_points) {
1103     code << "// AtPoints basis source\n";
1104     code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h>\n\n";
1105   }
1106   code << "// CodeGen operator source\n";
1107   code << "#include <ceed/jit-source/cuda/cuda-gen-templates.h>\n\n";
1108 
1109   // Get QFunction name
1110   std::string qfunction_name(qf_data->qfunction_name);
1111   std::string operator_name;
1112 
1113   operator_name = "CeedKernelCudaGenOperator_" + qfunction_name;
1114 
1115   // Define CEED_Q_VLA
1116   code << "\n#undef CEED_Q_VLA\n";
1117   if (dim != 3 || is_at_points || use_3d_slices || !is_tensor) {
1118     code << "#define CEED_Q_VLA 1\n\n";
1119   } else {
1120     code << "#define CEED_Q_VLA " << Q_1d << "\n\n";
1121   }
1122 
1123   // Add user QFunction source
1124   {
1125     const char *source_path;
1126 
1127     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
1128     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/cuda/gen backend requires QFunction source code file");
1129 
1130     code << "// User QFunction source\n";
1131     code << "#include \"" << source_path << "\"\n\n";
1132   }
1133 
1134   // Setup
1135   code << "\n// -----------------------------------------------------------------------------\n";
1136   code << "// Operator Kernel\n";
1137   code << "// \n";
1138   code << "// d_[in,out]_i:   CeedVector device array\n";
1139   code << "// r_[in,out]_e_i: Element vector register\n";
1140   code << "// r_[in,out]_q_i: Quadrature space vector register\n";
1141   code << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
1142   code << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
1143   code << "// \n";
1144   code << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
1145   code << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
1146   code << "// -----------------------------------------------------------------------------\n";
1147   code << "extern \"C\" __global__ void " << operator_name
1148        << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda "
1149           "points) {\n";
1150 
1151   // Scratch buffers
1152   for (CeedInt i = 0; i < num_input_fields; i++) {
1153     CeedEvalMode eval_mode;
1154 
1155     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1156     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
1157       code << "  const CeedScalar *d_in_" << i << " = fields.inputs[" << i << "];\n";
1158     }
1159   }
1160   for (CeedInt i = 0; i < num_output_fields; i++) {
1161     code << "  CeedScalar *d_out_" << i << " = fields.outputs[" << i << "];\n";
1162   }
1163 
1164   code << "  const CeedInt dim = " << dim << ";\n";
1165   code << "  const CeedInt " << (is_tensor ? "Q_1d" : "Q") << " = " << Q_1d << ";\n";
1166   if (is_at_points) {
1167     code << "  const CeedInt max_num_points = " << max_num_points << ";\n";
1168     code << "  const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
1169   }
1170 
1171   // Shared data
1172   code << "  extern __shared__ CeedScalar slice[];\n";
1173   code << "  SharedData_Cuda data;\n";
1174   code << "  data.t_id_x = threadIdx.x;\n";
1175   code << "  data.t_id_y = threadIdx.y;\n";
1176   code << "  data.t_id_z = threadIdx.z;\n";
1177   code << "  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
1178   code << "  data.slice = slice + data.t_id_z*T_1D" << ((!is_tensor || dim == 1) ? "" : "*T_1D") << ";\n";
1179 
1180   // -- Determine input mat reuse
1181   FieldReuse_Cuda input_matrix_reuse[CEED_FIELD_MAX];
1182 
1183   for (CeedInt i = 0; i < num_input_fields; i++) {
1184     input_matrix_reuse[i].index = -1;
1185   }
1186   for (CeedInt i = 0; i < num_input_fields; i++) {
1187     CeedEvalMode eval_mode_i;
1188     CeedBasis    basis_i;
1189 
1190     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
1191     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
1192     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
1193     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
1194       CeedEvalMode eval_mode_j;
1195       CeedBasis    basis_j;
1196 
1197       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1198       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1199       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1200       if (basis_i == basis_j) {
1201         if (is_tensor) {
1202           input_matrix_reuse[i].index     = j;
1203           input_matrix_reuse[i].is_input  = true;
1204           input_matrix_reuse[i].eval_mode = eval_mode_j;
1205         } else {
1206           // For non-tensor can only re-use with the same eval mode
1207           if (eval_mode_i == eval_mode_j) {
1208             input_matrix_reuse[i].index     = j;
1209             input_matrix_reuse[i].is_input  = true;
1210             input_matrix_reuse[i].eval_mode = eval_mode_j;
1211           }
1212         }
1213       }
1214       CeedCallBackend(CeedBasisDestroy(&basis_j));
1215     }
1216     CeedCallBackend(CeedBasisDestroy(&basis_i));
1217   }
1218 
1219   // -- Determine output mat reuse
1220   FieldReuse_Cuda output_matrix_reuse[CEED_FIELD_MAX];
1221 
1222   for (CeedInt i = 0; i < num_output_fields; i++) {
1223     output_matrix_reuse[i].index = -1;
1224   }
1225   for (CeedInt i = 0; i < num_output_fields; i++) {
1226     CeedEvalMode eval_mode_i;
1227     CeedBasis    basis_i;
1228 
1229     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
1230     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
1231     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
1232       CeedEvalMode eval_mode_j;
1233       CeedBasis    basis_j;
1234 
1235       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1236       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1237       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1238       if (basis_i == basis_j) {
1239         if (is_tensor) {
1240           output_matrix_reuse[i].index     = j;
1241           output_matrix_reuse[i].is_input  = true;
1242           output_matrix_reuse[i].eval_mode = eval_mode_j;
1243         } else {
1244           // For non-tensor can only re-use with the same eval mode
1245           if (eval_mode_i == eval_mode_j) {
1246             output_matrix_reuse[i].index     = j;
1247             output_matrix_reuse[i].is_input  = true;
1248             output_matrix_reuse[i].eval_mode = eval_mode_j;
1249           }
1250         }
1251       }
1252       CeedCallBackend(CeedBasisDestroy(&basis_j));
1253     }
1254     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
1255       CeedEvalMode eval_mode_j;
1256       CeedBasis    basis_j;
1257 
1258       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
1259       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1260       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
1261       if (basis_i == basis_j) {
1262         if (is_tensor) {
1263           output_matrix_reuse[i].index     = j;
1264           output_matrix_reuse[i].is_input  = false;
1265           output_matrix_reuse[i].eval_mode = eval_mode_j;
1266         } else {
1267           // For non-tensor can only re-use with the same eval mode
1268           if (eval_mode_i == eval_mode_j) {
1269             output_matrix_reuse[i].index     = j;
1270             output_matrix_reuse[i].is_input  = false;
1271             output_matrix_reuse[i].eval_mode = eval_mode_j;
1272           }
1273         }
1274       }
1275       CeedCallBackend(CeedBasisDestroy(&basis_j));
1276     }
1277     CeedCallBackend(CeedBasisDestroy(&basis_i));
1278   }
1279 
1280   // Initialize constants, and matrices B and G
1281   code << "\n  // Input field constants and basis data\n";
1282   for (CeedInt i = 0; i < num_input_fields; i++) {
1283     CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], Q_1d,
1284                                                               true, is_tensor, is_at_points, use_3d_slices));
1285   }
1286   code << "\n  // Output field constants and basis data\n";
1287   for (CeedInt i = 0; i < num_output_fields; i++) {
1288     CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], Q_1d,
1289                                                               false, is_tensor, is_at_points, use_3d_slices));
1290   }
1291 
1292   // Loop over all elements
1293   code << "\n  // Element loop\n";
1294   code << "  __syncthreads();\n";
1295   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
1296 
1297   // -- Compute minimum buffer space needed
1298   CeedInt max_rstr_buffer_size = 1;
1299 
1300   for (CeedInt i = 0; i < num_input_fields; i++) {
1301     CeedInt             num_comp, elem_size;
1302     CeedElemRestriction elem_rstr;
1303 
1304     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
1305     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1306     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
1307     max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_tensor && (dim >= 3) ? elem_size : 1));
1308     CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1309   }
1310   for (CeedInt i = 0; i < num_output_fields; i++) {
1311     CeedInt             num_comp, elem_size;
1312     CeedElemRestriction elem_rstr;
1313 
1314     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
1315     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1316     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
1317     max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_tensor && (dim >= 3) ? elem_size : 1));
1318     CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1319   }
1320   code << "    // Scratch restriction buffer space\n";
1321   code << "    CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
1322 
1323   // -- Determine best input field processing order
1324   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
1325 
1326   for (CeedInt i = 0; i < num_input_fields; i++) {
1327     field_rstr_in_buffer[i] = -1;
1328     input_field_order[i]    = -1;
1329   }
1330   {
1331     bool    is_ordered[CEED_FIELD_MAX];
1332     CeedInt curr_index = 0;
1333 
1334     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
1335     for (CeedInt i = 0; i < num_input_fields; i++) {
1336       CeedVector          vec_i;
1337       CeedElemRestriction rstr_i;
1338 
1339       if (is_ordered[i]) continue;
1340       field_rstr_in_buffer[i]       = i;
1341       is_ordered[i]                 = true;
1342       input_field_order[curr_index] = i;
1343       curr_index++;
1344       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
1345       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
1346       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
1347       for (CeedInt j = i + 1; j < num_input_fields; j++) {
1348         CeedVector          vec_j;
1349         CeedElemRestriction rstr_j;
1350 
1351         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
1352         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
1353         if (rstr_i == rstr_j && vec_i == vec_j) {
1354           field_rstr_in_buffer[j]       = i;
1355           is_ordered[j]                 = true;
1356           input_field_order[curr_index] = j;
1357           curr_index++;
1358         }
1359         CeedCallBackend(CeedVectorDestroy(&vec_j));
1360         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
1361       }
1362       CeedCallBackend(CeedVectorDestroy(&vec_i));
1363       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
1364     }
1365   }
1366 
1367   // -- Input restriction and basis
1368   code << "\n    // -- Input field restrictions and basis actions\n";
1369   for (CeedInt i = 0; i < num_input_fields; i++) {
1370     CeedInt f = input_field_order[i];
1371 
1372     code << "    // ---- Input field " << f << "\n";
1373 
1374     // ---- Restriction
1375     CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, f, dim, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
1376                                                                 Q_1d, true, is_tensor, is_at_points, use_3d_slices));
1377 
1378     // ---- Basis action
1379     CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, is_tensor,
1380                                                           is_at_points, use_3d_slices));
1381   }
1382 
1383   // -- Q function
1384   CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, dim, max_num_points, num_input_fields, op_input_fields, qf_input_fields,
1385                                                             num_output_fields, op_output_fields, qf_output_fields, qfunction_name, Q_1d, is_tensor,
1386                                                             is_at_points, use_3d_slices));
1387 
1388   // -- Output basis and restriction
1389   code << "\n    // -- Output field basis action and restrictions\n";
1390   for (CeedInt i = 0; i < num_output_fields; i++) {
1391     code << "    // ---- Output field " << i << "\n";
1392 
1393     // ---- Basis action
1394     CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_tensor,
1395                                                           is_at_points, use_3d_slices));
1396 
1397     // ---- Restriction
1398     CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false,
1399                                                                 is_tensor, is_at_points, use_3d_slices));
1400   }
1401 
1402   // Close loop and function
1403   code << "  }\n";
1404   code << "}\n";
1405   code << "// -----------------------------------------------------------------------------\n\n";
1406 
1407   // Compile
1408   {
1409     bool is_compile_good = false;
1410 
1411     CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "T_1D", CeedIntMax(Q_1d, data->max_P_1d)));
1412     if (is_compile_good) {
1413       *is_good_build = true;
1414       CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op));
1415     } else {
1416       *is_good_build     = false;
1417       data->use_fallback = true;
1418     }
1419   }
1420   CeedCallBackend(CeedOperatorSetSetupDone(op));
1421   CeedCallBackend(CeedDestroy(&ceed));
1422   CeedCallBackend(CeedQFunctionDestroy(&qf));
1423   return CEED_ERROR_SUCCESS;
1424 }
1425 
1426 //------------------------------------------------------------------------------
1427