xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision c433aabc65bbaa1ba98688d834b18c213e9c6270)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #define CEED_DEBUG_COLOR 12
9 
10 #include <ceed.h>
11 #include <ceed/backend.h>
12 #include <ceed/jit-tools.h>
13 #include <cuda_runtime.h>
14 
15 #include <iostream>
16 #include <sstream>
17 #include <string>
18 
19 #include "../cuda-ref/ceed-cuda-ref.h"
20 #include "../cuda-shared/ceed-cuda-shared.h"
21 #include "../cuda/ceed-cuda-common.h"
22 #include "../cuda/ceed-cuda-compile.h"
23 #include "ceed-cuda-gen.h"
24 
// Describes reuse of a previously loaded shared-memory basis matrix:
// when index != -1, the field identified by (index, is_input) already loaded
// a matrix that the current field can alias instead of loading its own copy.
struct FieldReuse_Cuda {
  CeedInt      index;      // index of the field whose shared matrix is reused; -1 when there is no reuse
  bool         is_input;   // true if the reused field is an input field, false if an output field
  CeedEvalMode eval_mode;  // eval mode of the reused field (e.g. gradient matrices only reusable from CEED_EVAL_GRAD)
};
30 
31 //------------------------------------------------------------------------------
32 // Determine type of operator
33 //------------------------------------------------------------------------------
34 static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
35                                                 CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields,
36                                                 CeedQFunctionField *qf_output_fields, CeedInt *max_P, CeedInt *max_P_1d, CeedInt *Q, CeedInt *Q_1d,
37                                                 CeedInt *max_dim, bool *is_all_tensor, bool *use_3d_slices) {
38   // Check if all are tensor
39   *is_all_tensor = true;
40   for (CeedInt i = 0; i < num_input_fields; i++) {
41     CeedBasis basis;
42 
43     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
44     if (basis != CEED_BASIS_NONE) {
45       bool is_field_tensor;
46 
47       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
48       *is_all_tensor = *is_all_tensor && is_field_tensor;
49     }
50     CeedCallBackend(CeedBasisDestroy(&basis));
51   }
52   for (CeedInt i = 0; i < num_output_fields; i++) {
53     CeedBasis basis;
54 
55     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
56     if (basis != CEED_BASIS_NONE) {
57       bool is_field_tensor;
58 
59       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
60       *is_all_tensor = *is_all_tensor && is_field_tensor;
61     }
62     CeedCallBackend(CeedBasisDestroy(&basis));
63   }
64 
65   // Find max_P, max_P_1d, Q, and Q_1d
66   bool is_all_3d = true;
67 
68   *max_P    = 0;
69   *max_P_1d = 0;
70   *Q        = 0;
71   *Q_1d     = 0;
72   for (CeedInt i = 0; i < num_input_fields; i++) {
73     CeedBasis basis;
74 
75     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
76     if (basis != CEED_BASIS_NONE) {
77       bool    is_field_tensor;
78       CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
79 
80       // Check if 3D
81       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
82       is_all_3d = is_all_3d && (field_dim == 3);
83       *max_dim  = CeedIntMax(*max_dim, field_dim);
84 
85       // Collect P, P_1d, Q, and Q_1d
86       CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
87       *max_P = CeedIntMax(*max_P, field_P);
88       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
89       if (is_field_tensor) {
90         CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
91         *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
92       }
93       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
94       CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
95       *Q = field_Q;
96       if (is_field_tensor) {
97         CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
98         CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
99         *Q_1d = field_Q_1d;
100       }
101     }
102     CeedCallBackend(CeedBasisDestroy(&basis));
103   }
104   for (CeedInt i = 0; i < num_output_fields; i++) {
105     CeedBasis basis;
106 
107     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
108     if (basis != CEED_BASIS_NONE) {
109       bool    is_field_tensor;
110       CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
111 
112       // Check if 3D
113       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
114       is_all_3d = is_all_3d && (field_dim == 3);
115       *max_dim  = CeedIntMax(*max_dim, field_dim);
116 
117       // Collect P, P_1d, Q, and Q_1d
118       CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
119       *max_P = CeedIntMax(*max_P, field_P);
120       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
121       if (is_field_tensor) {
122         CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
123         *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
124       }
125       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
126       CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
127       *Q = field_Q;
128       if (is_field_tensor) {
129         CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
130         CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
131         *Q_1d = field_Q_1d;
132       }
133     }
134     CeedCallBackend(CeedBasisDestroy(&basis));
135   }
136 
137   // Only use 3D collocated gradient parallelization strategy when gradient is computed
138   *use_3d_slices = false;
139   if (is_all_3d && *is_all_tensor) {
140     bool was_grad_found = false;
141 
142     for (CeedInt i = 0; i < num_input_fields; i++) {
143       CeedEvalMode eval_mode;
144 
145       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
146       if (eval_mode == CEED_EVAL_GRAD) {
147         CeedBasis_Cuda_shared *basis_data;
148         CeedBasis              basis;
149 
150         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
151         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
152         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
153         was_grad_found = true;
154         CeedCallBackend(CeedBasisDestroy(&basis));
155       }
156     }
157     for (CeedInt i = 0; i < num_output_fields; i++) {
158       CeedEvalMode eval_mode;
159 
160       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
161       if (eval_mode == CEED_EVAL_GRAD) {
162         CeedBasis_Cuda_shared *basis_data;
163         CeedBasis              basis;
164 
165         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
166         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
167         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
168         was_grad_found = true;
169         CeedCallBackend(CeedBasisDestroy(&basis));
170       }
171     }
172   }
173   return CEED_ERROR_SUCCESS;
174 }
175 
176 //------------------------------------------------------------------------------
177 // Setup fields
178 //------------------------------------------------------------------------------
179 static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedOperatorField op_field,
180                                                      CeedQFunctionField qf_field, FieldReuse_Cuda field_reuse, CeedInt Q, CeedInt Q_1d, bool is_input,
181                                                      bool is_all_tensor, bool is_at_points, bool use_3d_slices) {
182   bool      is_tensor = true;
183   CeedBasis basis;
184   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
185   if (basis != CEED_BASIS_NONE) CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
186 
187   const char            *field_name;
188   std::string            var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
189   std::string            P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
190   std::string            option_name = (is_input ? "inputs" : "outputs");
191   CeedEvalMode           eval_mode   = CEED_EVAL_NONE;
192   CeedInt                elem_size = 0, num_comp = 0, dim = 1, P_1d = 0;
193   CeedElemRestriction    elem_rstr;
194   CeedBasis_Cuda_shared *basis_data;
195 
196   // Field reuse info
197   bool use_previous_field = field_reuse.index != -1;
198 
199   CeedCallBackend(CeedOperatorFieldGetName(op_field, &field_name));
200   code << "  // -- " << (is_input ? "Input" : "Output") << " field " << i << ": " << field_name << "\n";
201 
202   // Get field data
203   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
204   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
205     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
206     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
207   }
208   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
209   if (basis != CEED_BASIS_NONE) {
210     CeedCallBackend(CeedBasisGetData(basis, &basis_data));
211     CeedCallBackend(CeedBasisGetDimension(basis, &dim));
212     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
213     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
214   }
215   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
216 
217   // Set field constants
218   code << "  const CeedInt dim" << var_suffix << " = " << dim << ";\n";
219   if (is_tensor && !is_all_tensor) {
220     CeedInt P = 0;
221 
222     CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
223     code << "  const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q : P) << ";\n";
224   }
225   code << "  const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
226   if (eval_mode != CEED_EVAL_WEIGHT) {
227     code << "  const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
228   }
229 
230   // Load basis data
231   code << "  // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
232   switch (eval_mode) {
233     case CEED_EVAL_NONE:
234       break;
235     case CEED_EVAL_INTERP:
236       if (is_at_points) {
237         // AtPoints
238         if (!basis_data->d_chebyshev_interp_1d) {
239           CeedSize    interp_bytes;
240           CeedScalar *chebyshev_interp_1d;
241 
242           interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
243           CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
244           CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
245           CeedCallCuda(CeedBasisReturnCeed(basis), cudaMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
246           CeedCallCuda(CeedBasisReturnCeed(basis),
247                        cudaMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
248           CeedCallBackend(CeedFree(&chebyshev_interp_1d));
249         }
250         if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
251         else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
252       } else {
253         // Standard quadrature
254         if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
255         else data->B.outputs[i] = basis_data->d_interp_1d;
256       }
257       if (use_previous_field) {
258         std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
259 
260         code << "  CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
261       } else {
262         code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
263         code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
264       }
265       break;
266     case CEED_EVAL_GRAD:
267       if (is_at_points) {
268         // AtPoints
269         if (!basis_data->d_chebyshev_interp_1d) {
270           CeedSize    interp_bytes;
271           CeedScalar *chebyshev_interp_1d;
272 
273           interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
274           CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
275           CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
276           CeedCallCuda(CeedBasisReturnCeed(basis), cudaMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
277           CeedCallCuda(CeedBasisReturnCeed(basis),
278                        cudaMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
279           CeedCallBackend(CeedFree(&chebyshev_interp_1d));
280         }
281         if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
282         else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
283       } else {
284         // Standard quadrature
285         if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
286         else data->B.outputs[i] = basis_data->d_interp_1d;
287       }
288       if (is_tensor) {
289         if (use_previous_field) {
290           std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
291 
292           code << "  CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
293         } else {
294           code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
295           code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
296         }
297       }
298       if (is_at_points) break;  // No G mat for AtPoints
299       if (use_3d_slices) {
300         if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
301         else data->G.outputs[i] = basis_data->d_collo_grad_1d;
302         if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
303           std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
304 
305           code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
306         } else {
307           code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
308           code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
309         }
310       } else {
311         bool has_collo_grad = basis_data->d_collo_grad_1d;
312 
313         if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
314         else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
315         if (has_collo_grad) {
316           if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
317             std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
318 
319             code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
320           } else {
321             code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
322             code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
323           }
324         } else {
325           if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
326             std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
327 
328             code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
329           } else {
330             code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim")
331                  << (is_tensor ? "" : var_suffix) << "];\n";
332             code << "  LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << (is_tensor ? "" : var_suffix) << ">(data, G."
333                  << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
334           }
335         }
336       }
337       break;
338     case CEED_EVAL_WEIGHT:
339       break;  // No action
340       // LCOV_EXCL_START
341     case CEED_EVAL_DIV:
342     case CEED_EVAL_CURL:
343       break;  // TODO: Not implemented
344               // LCOV_EXCL_STOP
345   }
346   CeedCallBackend(CeedBasisDestroy(&basis));
347   return CEED_ERROR_SUCCESS;
348 }
349 
//------------------------------------------------------------------------------
// Restriction
//
// Emits generated-kernel source that moves one field between its L-vector
// (d_*) and its per-element E-vector (r_e_*), and records the device offset
// arrays the generated kernel will read in data->indices.
//------------------------------------------------------------------------------
static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt max_dim,
                                                       CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
                                                       CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points, bool use_3d_slices) {
  std::string               var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
  std::string               P_name     = (is_all_tensor ? "P_1d" : "P") + var_suffix;
  CeedEvalMode              eval_mode  = CEED_EVAL_NONE;
  CeedInt                   elem_size = 0, num_comp = 0;
  CeedSize                  l_size;
  CeedRestrictionType       rstr_type = CEED_RESTRICTION_STANDARD;
  CeedElemRestriction_Cuda *rstr_data;
  CeedElemRestriction       elem_rstr;

  // Get field data
  // NOTE(review): rstr_data is only assigned when elem_rstr != CEED_ELEMRESTRICTION_NONE,
  // yet the STANDARD/POINTS cases below read rstr_data->d_offsets; presumably those
  // restriction types never occur with a NONE restriction -- confirm against callers
  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
    CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
    CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
  }
  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));

  // Restriction
  if (is_input) {
    // Input
    if (field_input_buffer[i] != i) {
      std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);

      // Restriction was already done for a previous input field; alias its buffer
      code << "    CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
    } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) {
      // WEIGHT needs no E-vector; NONE + 3D slices + AtPoints is also skipped
      if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) {
        // No basis action, so r_e_in_* is also r_q_in_* and needs to be allocated
        code << "    CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
      } else if (rstr_type != CEED_RESTRICTION_POINTS) {
        // Otherwise we're using the scratch space
        code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
      }
      switch (rstr_type) {
        case CEED_RESTRICTION_STANDARD: {
          // Indexed gather via the restriction's device offsets
          CeedInt comp_stride;

          CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
          code << "    const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
          code << "    // CompStride: " << comp_stride << "\n";
          data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
          code << "    ReadLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name
               << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix << ");\n";
          break;
        }
        case CEED_RESTRICTION_STRIDED: {
          // Strided access; default strides unless the restriction carries its own
          bool    has_backend_strides;
          CeedInt num_elem;

          CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
          CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
          CeedInt strides[3] = {1, elem_size * num_elem, elem_size};

          if (!has_backend_strides) {
            CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
          }
          code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
          code << "    ReadLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", " << strides[0] << ", "
               << strides[1] << ", " << strides[2] << ">(data, elem, d" << var_suffix << ", r_e" << var_suffix << ");\n";
          break;
        }
        case CEED_RESTRICTION_POINTS: {
          // AtPoints restriction: only record stride constant and offsets; no read emitted here
          CeedInt comp_stride;

          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
          code << "    const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
          data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
          break;
        }
        // LCOV_EXCL_START
        case CEED_RESTRICTION_ORIENTED:
        case CEED_RESTRICTION_CURL_ORIENTED:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
  } else {
    // Output: scatter the E-vector back to the L-vector
    switch (rstr_type) {
      case CEED_RESTRICTION_STANDARD: {
        CeedInt comp_stride;

        CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
        code << "    const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
        CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
        code << "    // CompStride: " << comp_stride << "\n";
        data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
        code << "    WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name
             << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix << ");\n";
        break;
      }
      case CEED_RESTRICTION_STRIDED: {
        bool    has_backend_strides;
        CeedInt num_elem;

        CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
        CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
        CeedInt strides[3] = {1, elem_size * num_elem, elem_size};

        if (!has_backend_strides) {
          CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
        }
        code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
        code << "    WriteLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", " << strides[0] << ", "
             << strides[1] << ", " << strides[2] << ">(data, elem, r_e" << var_suffix << ", d" << var_suffix << ");\n";
        break;
      }
      case CEED_RESTRICTION_POINTS:
        // AtPoints restriction: only record the offsets; no write emitted here
        data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
        break;
      // LCOV_EXCL_START
      case CEED_RESTRICTION_ORIENTED:
      case CEED_RESTRICTION_CURL_ORIENTED:
        break;  // TODO: Not implemented
                // LCOV_EXCL_STOP
    }
  }
  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
  return CEED_ERROR_SUCCESS;
}
479 
480 //------------------------------------------------------------------------------
481 // Basis
482 //------------------------------------------------------------------------------
483 static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt max_dim,
484                                                  CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input,
485                                                  bool is_all_tensor, bool is_at_points, bool use_3d_slices) {
486   bool      is_tensor = true;
487   CeedBasis basis;
488   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
489   CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
490 
491   std::string         var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
492   std::string         P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
493   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
494   CeedInt             dim = max_dim, elem_size = 0, num_comp = 0, P_1d = 0;
495   CeedElemRestriction elem_rstr;
496 
497   // Get field data
498   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
499   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
500     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
501     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
502   }
503   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
504   if (basis != CEED_BASIS_NONE) {
505     CeedCallBackend(CeedBasisGetDimension(basis, &dim));
506     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
507     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
508   }
509   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
510 
511   // Basis
512   code << "    // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
513   if (is_input) {
514     switch (eval_mode) {
515       case CEED_EVAL_NONE:
516         if (!use_3d_slices && !is_at_points) {
517           code << "    CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n";
518         }
519         break;
520       case CEED_EVAL_INTERP:
521         if (is_at_points) {
522           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
523 
524           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
525           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
526                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
527         } else {
528           std::string function_name = is_tensor
529                                           ? ((dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
530                                           : "InterpNonTensor";
531           std::string op_t_1d_name  = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
532 
533           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
534           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
535                << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
536         }
537         break;
538       case CEED_EVAL_GRAD:
539         if (is_at_points) {
540           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
541 
542           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
543           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
544                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
545         } else if (use_3d_slices) {
546           std::string function_name = (dim > 1 ? "InterpTensor" : "Interp") + std::to_string(dim) + "d";
547 
548           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
549           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
550                << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
551         } else if (is_tensor) {
552           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
553           std::string function_name = (dim == 1 ? "Grad" : (is_collocated ? "GradTensorCollocated" : "GradTensor")) + std::to_string(dim) + "d" +
554                                       (is_all_tensor ? "" : "Flattened");
555           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
556 
557           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*" << (dim >= 3 ? Q_name : "1")
558                << "];\n";
559           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
560                << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
561         } else {
562           std::string function_name = "GradNonTensor";
563 
564           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
565           code << "    " << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
566                << ", OP_T_1D>(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
567         }
568         break;
569       case CEED_EVAL_WEIGHT: {
570         if (is_at_points) {
571           code << "    // Nothing to do AtPoints\n";
572         } else {
573           CeedBasis_Cuda_shared *basis_data;
574           std::string            function_name = is_tensor
575                                                      ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
576                                                      : "WeightNonTensor";
577 
578           code << "    CeedScalar r_q" << var_suffix << "[" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
579           CeedCallBackend(CeedBasisGetData(basis, &basis_data));
580           data->W = basis_data->d_q_weight_1d;
581           code << "    " << function_name << "<" << P_name << ", " << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
582         }
583         break;
584       }
585       // LCOV_EXCL_START
586       case CEED_EVAL_DIV:
587       case CEED_EVAL_CURL:
588         break;  // TODO: Not implemented
589                 // LCOV_EXCL_STOP
590     }
591   } else {
592     switch (eval_mode) {
593       case CEED_EVAL_NONE:
594         code << "    CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
595         break;  // No action
596       case CEED_EVAL_INTERP:
597         code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
598         if (is_at_points) {
599           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
600 
601           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
602                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
603         } else {
604           std::string function_name =
605               is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
606                         : "InterpTransposeNonTensor";
607           std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
608 
609           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
610                << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
611         }
612         break;
613       case CEED_EVAL_GRAD:
614         code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
615         if (is_at_points) {
616           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
617 
618           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
619                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
620         } else if (use_3d_slices) {
621           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
622 
623           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
624                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
625         } else if (is_tensor) {
626           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
627           std::string function_name = (dim == 1 ? "GradTranspose" : (is_collocated ? "GradTransposeTensorCollocated" : "GradTransposeTensor")) +
628                                       std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
629           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
630 
631           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
632                << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
633         } else {
634           std::string function_name = "GradTransposeNonTensor";
635 
636           code << "    " << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
637                << ", OP_T_1D>(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
638         }
639         break;
640       // LCOV_EXCL_START
641       case CEED_EVAL_WEIGHT:
642         break;  // Should not occur
643       case CEED_EVAL_DIV:
644       case CEED_EVAL_CURL:
645         break;  // TODO: Not implemented
646                 // LCOV_EXCL_STOP
647     }
648   }
649   CeedCallBackend(CeedBasisDestroy(&basis));
650   return CEED_ERROR_SUCCESS;
651 }
652 
653 //------------------------------------------------------------------------------
654 // QFunction
655 //------------------------------------------------------------------------------
656 static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt max_dim, CeedInt max_num_points,
657                                                      CeedInt num_input_fields, CeedOperatorField *op_input_fields,
658                                                      CeedQFunctionField *qf_input_fields, CeedInt num_output_fields,
659                                                      CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields,
660                                                      std::string qfunction_name, CeedInt Q_1d, bool is_all_tensor, bool is_at_points,
661                                                      bool use_3d_slices) {
662   std::string         Q_name    = is_all_tensor ? "Q_1d" : "Q";
663   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
664   CeedElemRestriction elem_rstr;
665 
666   // Setup output arrays
667   code << "\n    // -- Output field setup\n";
668   for (CeedInt i = 0; i < num_output_fields; i++) {
669     const char *field_name;
670     std::string var_suffix = "_out_" + std::to_string(i);
671 
672     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
673     code << "    // ---- Output field " << i << ": " << field_name << "\n";
674     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
675     switch (eval_mode) {
676       case CEED_EVAL_NONE:
677         if (is_at_points) {
678           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n";
679         } else {
680           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
681                << "];\n";
682         }
683         break;
684       case CEED_EVAL_INTERP:
685         if (is_at_points) {
686           // Accumulator for point data
687           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
688           code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) {\n";
689           code << "      r_c" << var_suffix << "[i] = 0.0;\n";
690           code << "    }\n";
691         } else {
692           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
693                << "];\n";
694         }
695         break;
696       case CEED_EVAL_GRAD:
697         if (is_at_points) {
698           // Accumulator for point data
699           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "*dim" << var_suffix
700                << "];\n";
701           code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) {\n";
702           code << "      r_c" << var_suffix << "[i] = 0.0;\n";
703           code << "    }\n";
704         } else if (use_3d_slices) {
705           // Accumulator for gradient slices
706           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
707           code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) {\n";
708           code << "      r_q" << var_suffix << "[i] = 0.0;\n";
709           code << "    }\n";
710         } else {
711           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
712                << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") << "];\n";
713         }
714         break;
715       case CEED_EVAL_WEIGHT:
716         break;
717         // LCOV_EXCL_START
718       case CEED_EVAL_DIV:
719       case CEED_EVAL_CURL:
720         break;  // TODO: Not implemented
721                 // LCOV_EXCL_STOP
722     }
723   }
724 
725   if (is_at_points) {
726     // We need to handle batches of points
727     code << "\n    // Note: Using batches of points\n";
728     code << "    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * max_num_points / (blockDim.x * blockDim.y));\n\n";
729     code << "    #pragma unroll\n";
730     code << "    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {\n";
731     code << "      const CeedInt p = i % max_num_points;\n\n";
732 
733     code << "      // -- Coordinates\n";
734     code << "      CeedScalar r_x[max_dim];\n";
735     code << "      ReadPoint<max_dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n";
736 
737     code << "      // -- Input fields\n";
738     for (CeedInt i = 0; i < num_input_fields; i++) {
739       const char *field_name;
740       std::string var_suffix = "_in_" + std::to_string(i);
741       std::string P_name     = "P_1d" + var_suffix;
742 
743       CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
744       code << "      // ---- Input field " << i << ": " << field_name << "\n";
745       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
746       // Basis action
747       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
748       switch (eval_mode) {
749         case CEED_EVAL_NONE:
750           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
751           code << "      ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
752                << ", max_num_points>(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
753           break;
754         case CEED_EVAL_INTERP:
755           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
756           code << "      InterpAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
757                << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
758           break;
759         case CEED_EVAL_GRAD:
760           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
761           code << "      GradAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
762                << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
763           break;
764         case CEED_EVAL_WEIGHT:
765           code << "      CeedScalar r_s" << var_suffix << "[1];\n";
766           code << "      r_s" << var_suffix << "[0] = 1.0;\n";
767           break;
768           // LCOV_EXCL_START
769         case CEED_EVAL_DIV:
770         case CEED_EVAL_CURL:
771           break;  // TODO: Not implemented
772                   // LCOV_EXCL_STOP
773       }
774     }
775     code << "\n      // -- Output fields\n";
776     for (CeedInt i = 0; i < num_output_fields; i++) {
777       const char *field_name;
778       std::string var_suffix = "_out_" + std::to_string(i);
779 
780       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
781       code << "      // ---- Output field " << i << ": " << field_name << "\n";
782       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
783       // Basis action
784       switch (eval_mode) {
785         case CEED_EVAL_NONE:
786           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
787           break;
788         case CEED_EVAL_INTERP:
789           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
790           break;
791         case CEED_EVAL_GRAD:
792           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
793           break;
794           // LCOV_EXCL_START
795         case CEED_EVAL_WEIGHT:
796           break;  // Should not occur
797         case CEED_EVAL_DIV:
798         case CEED_EVAL_CURL:
799           break;  // TODO: Not implemented
800                   // LCOV_EXCL_STOP
801       }
802     }
803 
804   } else if (use_3d_slices) {
805     // We treat quadrature points per slice in 3d to save registers
806     code << "\n    // Note: Using planes of 3D elements\n";
807     code << "    #pragma unroll\n";
808     code << "    for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
809     code << "      // -- Input fields\n";
810     for (CeedInt i = 0; i < num_input_fields; i++) {
811       const char *field_name;
812       std::string var_suffix = "_in_" + std::to_string(i);
813 
814       CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
815       code << "      // ---- Input field " << i << ": " << field_name << "\n";
816       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
817       // Basis action
818       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
819       switch (eval_mode) {
820         case CEED_EVAL_NONE:
821           bool is_strided;
822 
823           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
824 
825           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
826           CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
827           if (is_strided) {
828             bool    has_backend_strides;
829             CeedInt num_elem, elem_size;
830 
831             CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
832             CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
833             CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
834             CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
835 
836             if (!has_backend_strides) {
837               CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
838             }
839             code << "      // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
840             code << "      ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", " << strides[0] << ", " << strides[1] << ", "
841                  << strides[2] << ">(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n";
842           } else {
843             CeedSize                  l_size = 0;
844             CeedInt                   comp_stride;
845             CeedElemRestriction_Cuda *rstr_data;
846 
847             CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
848             code << "      const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
849             CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
850             code << "      // CompStride: " << comp_stride << "\n";
851             CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
852             data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
853             code << "      ReadEVecSliceStandard3d<num_comp" << var_suffix << ", " << comp_stride << ", " << Q_name << ">(data, l_size" << var_suffix
854                  << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
855           }
856           CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
857           break;
858         case CEED_EVAL_INTERP:
859           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
860           code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n";
861           code << "        r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n";
862           code << "      }\n";
863           break;
864         case CEED_EVAL_GRAD:
865           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
866           code << "      GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_q" << var_suffix << ", s_G"
867                << var_suffix << ", r_s" << var_suffix << ");\n";
868           break;
869         case CEED_EVAL_WEIGHT:
870           code << "      CeedScalar r_s" << var_suffix << "[1];\n";
871           code << "      r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n";
872           break;
873           // LCOV_EXCL_START
874         case CEED_EVAL_DIV:
875         case CEED_EVAL_CURL:
876           break;  // TODO: Not implemented
877                   // LCOV_EXCL_STOP
878       }
879     }
880     code << "\n      // -- Output fields\n";
881     for (CeedInt i = 0; i < num_output_fields; i++) {
882       const char *field_name;
883       std::string var_suffix = "_out_" + std::to_string(i);
884 
885       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
886       code << "      // ---- Output field " << i << ": " << field_name << "\n";
887       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
888       // Basis action
889       switch (eval_mode) {
890         case CEED_EVAL_NONE:
891           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
892           break;
893         case CEED_EVAL_INTERP:
894           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
895           break;
896         case CEED_EVAL_GRAD:
897           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
898           break;
899           // LCOV_EXCL_START
900         case CEED_EVAL_WEIGHT:
901           break;  // Should not occur
902         case CEED_EVAL_DIV:
903         case CEED_EVAL_CURL:
904           break;  // TODO: Not implemented
905                   // LCOV_EXCL_STOP
906       }
907     }
908   } else {
909     code << "\n    // Note: Using full elements\n";
910     code << "    {\n";
911     code << "      // -- Input fields\n";
912     for (CeedInt i = 0; i < num_input_fields; i++) {
913       const char *field_name;
914 
915       CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
916       code << "      // ---- Input field " << i << ": " << field_name << "\n";
917       code << "      CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n";
918     }
919     code << "      // -- Output fields\n";
920     for (CeedInt i = 0; i < num_output_fields; i++) {
921       const char *field_name;
922 
923       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
924       code << "      // ---- Output field " << i << ": " << field_name << "\n";
925       code << "      CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n";
926     }
927   }
928 
929   // Input and output buffers
930   code << "\n      // -- QFunction inputs and outputs\n";
931   code << "      // ---- Inputs\n";
932   code << "      CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
933   for (CeedInt i = 0; i < num_input_fields; i++) {
934     const char *field_name;
935 
936     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
937     code << "      // ------ Input field " << i << ": " << field_name << "\n";
938     code << "      inputs[" << i << "] = r_s_in_" << i << ";\n";
939   }
940   code << "      // ---- Outputs\n";
941   code << "      CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
942   for (CeedInt i = 0; i < num_output_fields; i++) {
943     const char *field_name;
944 
945     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
946     code << "      // ------ Output field " << i << ": " << field_name << "\n";
947     code << "      outputs[" << i << "] = r_s_out_" << i << ";\n";
948   }
949 
950   // Apply QFunction
951   code << "\n      // -- Apply QFunction\n";
952   code << "      " << qfunction_name << "(ctx, ";
953   if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
954     code << "1";
955   } else {
956     code << Q_name;
957   }
958   code << ", inputs, outputs);\n";
959 
960   if (is_at_points) {
961     // Map back to coefficients
962     code << "\n      // -- Output fields\n";
963     for (CeedInt i = 0; i < num_output_fields; i++) {
964       const char *field_name;
965       std::string var_suffix = "_out_" + std::to_string(i);
966       std::string P_name     = "P_1d" + var_suffix;
967 
968       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
969       code << "      // ---- Output field " << i << ": " << field_name << "\n";
970       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
971       // Basis action
972       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
973       switch (eval_mode) {
974         case CEED_EVAL_NONE: {
975           CeedInt             comp_stride;
976           CeedElemRestriction elem_rstr;
977 
978           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
979           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
980           CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
981           code << "      const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
982           code << "      WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
983                << ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]"
984                << ", r_s" << var_suffix << ", d" << var_suffix << ");\n";
985           break;
986         }
987         case CEED_EVAL_INTERP:
988           code << "      if (i >= points.num_per_elem[elem]) {\n";
989           code << "        for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
990           code << "      }\n";
991           code << "      InterpTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
992                << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
993           break;
994         case CEED_EVAL_GRAD:
995           code << "      if (i >= points.num_per_elem[elem]) {\n";
996           code << "        for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
997           code << "      }\n";
998           code << "      GradTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
999                << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
1000           break;
1001           // LCOV_EXCL_START
1002         case CEED_EVAL_WEIGHT:
1003           break;  // Should not occur
1004         case CEED_EVAL_DIV:
1005         case CEED_EVAL_CURL:
1006           break;  // TODO: Not implemented
1007                   // LCOV_EXCL_STOP
1008       }
1009     }
1010   } else if (use_3d_slices) {
1011     // Copy or apply transpose grad, if needed
1012     code << "\n      // -- Output fields\n";
1013     for (CeedInt i = 0; i < num_output_fields; i++) {
1014       const char *field_name;
1015       std::string var_suffix = "_out_" + std::to_string(i);
1016       std::string P_name     = "P_1d" + var_suffix;
1017 
1018       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
1019       code << "      // ---- Output field " << i << ": " << field_name << "\n";
1020       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
1021       // Basis action
1022       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
1023       switch (eval_mode) {
1024         case CEED_EVAL_NONE:
1025           code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
1026           code << "        r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
1027           code << "      }\n";
1028           break;
1029         case CEED_EVAL_INTERP:
1030           code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
1031           code << "        r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
1032           code << "      }\n";
1033           break;
1034         case CEED_EVAL_GRAD:
1035           code << "      GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_s" << var_suffix << ", s_G"
1036                << var_suffix << ", r_q" << var_suffix << ");\n";
1037           break;
1038           // LCOV_EXCL_START
1039         case CEED_EVAL_WEIGHT:
1040           break;  // Should not occur
1041         case CEED_EVAL_DIV:
1042         case CEED_EVAL_CURL:
1043           break;  // TODO: Not implemented
1044                   // LCOV_EXCL_STOP
1045       }
1046     }
1047   }
1048   code << "    }\n";
1049   return CEED_ERROR_SUCCESS;
1050 }
1051 
1052 //------------------------------------------------------------------------------
1053 // Build single operator kernel
1054 //------------------------------------------------------------------------------
1055 extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_build) {
1056   bool                    is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
1057   Ceed                    ceed;
1058   CeedInt                 Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
1059   CeedQFunctionField     *qf_input_fields, *qf_output_fields;
1060   CeedQFunction_Cuda_gen *qf_data;
1061   CeedQFunction           qf;
1062   CeedOperatorField      *op_input_fields, *op_output_fields;
1063   CeedOperator_Cuda_gen  *data;
1064   std::ostringstream      code;
1065 
1066   CeedCallBackend(CeedOperatorGetData(op, &data));
1067   {
1068     bool is_setup_done;
1069 
1070     CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
1071     if (is_setup_done) {
1072       *is_good_build = !data->use_fallback;
1073       return CEED_ERROR_SUCCESS;
1074     }
1075   }
1076 
1077   // Check field compatibility
1078   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
1079   {
1080     bool has_shared_bases = true;
1081 
1082     for (CeedInt i = 0; i < num_input_fields; i++) {
1083       CeedBasis basis;
1084 
1085       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
1086       if (basis != CEED_BASIS_NONE) {
1087         bool        is_tensor = true;
1088         const char *resource;
1089         char       *resource_root;
1090         Ceed        basis_ceed;
1091 
1092         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
1093         is_all_tensor    = is_all_tensor && is_tensor;
1094         is_all_nontensor = is_all_nontensor && !is_tensor;
1095         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
1096         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
1097         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
1098         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared");
1099         CeedCallBackend(CeedFree(&resource_root));
1100         CeedCallBackend(CeedDestroy(&basis_ceed));
1101       }
1102       CeedCallBackend(CeedBasisDestroy(&basis));
1103     }
1104 
1105     for (CeedInt i = 0; i < num_output_fields; i++) {
1106       CeedBasis basis;
1107 
1108       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
1109       if (basis != CEED_BASIS_NONE) {
1110         bool        is_tensor = true;
1111         const char *resource;
1112         char       *resource_root;
1113         Ceed        basis_ceed;
1114 
1115         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
1116         is_all_tensor    = is_all_tensor && is_tensor;
1117         is_all_nontensor = is_all_nontensor && !is_tensor;
1118 
1119         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
1120         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
1121         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
1122         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared");
1123         CeedCallBackend(CeedFree(&resource_root));
1124         CeedCallBackend(CeedDestroy(&basis_ceed));
1125       }
1126       CeedCallBackend(CeedBasisDestroy(&basis));
1127     }
1128     // -- Fallback to ref if not all bases are shared
1129     if (!has_shared_bases) {
1130       *is_good_build = false;
1131       return CEED_ERROR_SUCCESS;
1132     }
1133   }
1134   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1135   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
1136   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
1137   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
1138 
1139   // Get operator data
1140   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
1141   {
1142     CeedInt max_P, max_P_1d;
1143 
1144     CeedCallBackend(CeedOperatorBuildKernelData_Cuda_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields,
1145                                                          op_output_fields, qf_output_fields, &max_P, &max_P_1d, &Q, &Q_1d, &max_dim, &is_all_tensor,
1146                                                          &use_3d_slices));
1147     data->max_P_1d = is_all_tensor ? max_P_1d : max_P;
1148   }
1149   if (max_dim == 0) max_dim = 1;
1150   data->dim = max_dim;
1151   if (is_at_points) {
1152     CeedElemRestriction_Cuda *rstr_data;
1153     CeedElemRestriction       rstr_points = NULL;
1154 
1155     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
1156     CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
1157     CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
1158     CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data));
1159     data->points.indices = (CeedInt *)rstr_data->d_offsets;
1160     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
1161   }
1162   if (is_at_points) use_3d_slices = false;
1163   if (Q_1d == 0) {
1164     if (is_at_points) Q_1d = max_num_points;
1165     else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d));
1166   }
1167   if (Q == 0) Q = Q_1d;
1168   data->Q    = Q;
1169   data->Q_1d = Q_1d;
1170 
1171   // Check for restriction only identity operator
1172   {
1173     bool is_identity_qf;
1174 
1175     CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf));
1176     if (is_identity_qf) {
1177       CeedEvalMode eval_mode_in, eval_mode_out;
1178 
1179       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in));
1180       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out));
1181       CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND,
1182                 "Backend does not implement restriction only identity operators");
1183     }
1184   }
1185 
1186   // Add atomicAdd function for old NVidia architectures
1187   {
1188     Ceed_Cuda            *ceed_data;
1189     struct cudaDeviceProp prop;
1190 
1191     CeedCallBackend(CeedGetData(ceed, &ceed_data));
1192     CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id));
1193     if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) {
1194       code << "// AtomicAdd fallback source\n";
1195       code << "#include <ceed/jit-source/cuda/cuda-atomic-add-fallback.h>\n\n";
1196     }
1197   }
1198 
1199   // Load basis source files
1200   if (!is_all_nontensor) {
1201     code << "// Tensor basis source\n";
1202     code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n";
1203   }
1204   if (!is_all_tensor) {
1205     code << "// Non-tensor basis source\n";
1206     code << "#include <ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h>\n\n";
1207   }
1208   if (is_at_points) {
1209     code << "// AtPoints basis source\n";
1210     code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h>\n\n";
1211   }
1212   code << "// CodeGen operator source\n";
1213   code << "#include <ceed/jit-source/cuda/cuda-gen-templates.h>\n\n";
1214 
1215   // Get QFunction name
1216   std::string qfunction_name(qf_data->qfunction_name);
1217   std::string operator_name;
1218 
1219   operator_name = "CeedKernelCudaGenOperator_" + qfunction_name;
1220 
1221   // Define CEED_Q_VLA
1222   code << "\n#undef CEED_Q_VLA\n";
1223   if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
1224     code << "#define CEED_Q_VLA 1\n\n";
1225   } else {
1226     code << "#define CEED_Q_VLA " << Q_1d << "\n\n";
1227   }
1228 
1229   // Add user QFunction source
1230   {
1231     const char *source_path;
1232 
1233     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
1234     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/cuda/gen backend requires QFunction source code file");
1235 
1236     code << "// User QFunction source\n";
1237     code << "#include \"" << source_path << "\"\n\n";
1238   }
1239 
1240   // Setup
1241   code << "\n// -----------------------------------------------------------------------------\n";
1242   code << "// Operator Kernel\n";
1243   code << "// \n";
1244   code << "// d_[in,out]_i:   CeedVector device array\n";
1245   code << "// r_[in,out]_e_i: Element vector register\n";
1246   code << "// r_[in,out]_q_i: Quadrature space vector register\n";
1247   code << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
1248   code << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
1249   code << "// \n";
1250   code << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
1251   code << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
1252   code << "// -----------------------------------------------------------------------------\n";
1253   code << "extern \"C\" __global__ void " << operator_name
1254        << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda "
1255           "points) {\n";
1256 
1257   // Scratch buffers
1258   for (CeedInt i = 0; i < num_input_fields; i++) {
1259     CeedEvalMode eval_mode;
1260 
1261     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1262     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
1263       code << "  const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
1264     }
1265   }
1266   for (CeedInt i = 0; i < num_output_fields; i++) {
1267     code << "  CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
1268   }
1269 
1270   code << "  const CeedInt max_dim = " << max_dim << ";\n";
1271   if (!is_all_tensor) {
1272     code << "  const CeedInt Q = " << Q << ";\n";
1273   }
1274   if (!is_all_nontensor) {
1275     code << "  const CeedInt Q_1d = " << Q_1d << ";\n";
1276   }
1277   if (is_at_points) {
1278     code << "  const CeedInt max_num_points = " << max_num_points << ";\n";
1279     code << "  const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
1280   }
1281 
1282   // Shared data
1283   code << "  extern __shared__ CeedScalar slice[];\n";
1284   code << "  SharedData_Cuda data;\n";
1285   code << "  data.t_id_x = threadIdx.x;\n";
1286   code << "  data.t_id_y = threadIdx.y;\n";
1287   code << "  data.t_id_z = threadIdx.z;\n";
1288   code << "  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
1289   code << "  data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
1290 
1291   // -- Determine input mat reuse
1292   FieldReuse_Cuda input_matrix_reuse[CEED_FIELD_MAX];
1293 
1294   for (CeedInt i = 0; i < num_input_fields; i++) {
1295     input_matrix_reuse[i].index = -1;
1296   }
1297   for (CeedInt i = 0; i < num_input_fields; i++) {
1298     bool         is_tensor = true;
1299     CeedEvalMode eval_mode_i;
1300     CeedBasis    basis_i;
1301 
1302     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
1303     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
1304     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
1305     CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
1306     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
1307       CeedEvalMode eval_mode_j;
1308       CeedBasis    basis_j;
1309 
1310       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1311       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1312       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1313       if (basis_i == basis_j) {
1314         if (is_tensor) {
1315           input_matrix_reuse[i].index     = j;
1316           input_matrix_reuse[i].is_input  = true;
1317           input_matrix_reuse[i].eval_mode = eval_mode_j;
1318         } else {
1319           // For non-tensor can only re-use with the same eval mode
1320           if (eval_mode_i == eval_mode_j) {
1321             input_matrix_reuse[i].index     = j;
1322             input_matrix_reuse[i].is_input  = true;
1323             input_matrix_reuse[i].eval_mode = eval_mode_j;
1324           }
1325         }
1326       }
1327       CeedCallBackend(CeedBasisDestroy(&basis_j));
1328     }
1329     CeedCallBackend(CeedBasisDestroy(&basis_i));
1330   }
1331 
1332   // -- Determine output mat reuse
1333   FieldReuse_Cuda output_matrix_reuse[CEED_FIELD_MAX];
1334 
1335   for (CeedInt i = 0; i < num_output_fields; i++) {
1336     output_matrix_reuse[i].index = -1;
1337   }
1338   for (CeedInt i = 0; i < num_output_fields; i++) {
1339     bool         is_tensor = true;
1340     CeedEvalMode eval_mode_i;
1341     CeedBasis    basis_i;
1342 
1343     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
1344     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
1345     CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
1346     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
1347       CeedEvalMode eval_mode_j;
1348       CeedBasis    basis_j;
1349 
1350       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1351       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1352       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1353       if (basis_i == basis_j) {
1354         if (is_tensor) {
1355           output_matrix_reuse[i].index     = j;
1356           output_matrix_reuse[i].is_input  = true;
1357           output_matrix_reuse[i].eval_mode = eval_mode_j;
1358         } else {
1359           // For non-tensor can only re-use with the same eval mode
1360           if (eval_mode_i == eval_mode_j) {
1361             output_matrix_reuse[i].index     = j;
1362             output_matrix_reuse[i].is_input  = true;
1363             output_matrix_reuse[i].eval_mode = eval_mode_j;
1364           }
1365         }
1366       }
1367       CeedCallBackend(CeedBasisDestroy(&basis_j));
1368     }
1369     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
1370       CeedEvalMode eval_mode_j;
1371       CeedBasis    basis_j;
1372 
1373       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
1374       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1375       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
1376       if (basis_i == basis_j) {
1377         if (is_tensor) {
1378           output_matrix_reuse[i].index     = j;
1379           output_matrix_reuse[i].is_input  = false;
1380           output_matrix_reuse[i].eval_mode = eval_mode_j;
1381         } else {
1382           // For non-tensor can only re-use with the same eval mode
1383           if (eval_mode_i == eval_mode_j) {
1384             output_matrix_reuse[i].index     = j;
1385             output_matrix_reuse[i].is_input  = false;
1386             output_matrix_reuse[i].eval_mode = eval_mode_j;
1387           }
1388         }
1389       }
1390       CeedCallBackend(CeedBasisDestroy(&basis_j));
1391     }
1392     CeedCallBackend(CeedBasisDestroy(&basis_i));
1393   }
1394 
1395   // Initialize constants, and matrices B and G
1396   code << "\n  // Input field constants and basis data\n";
1397   for (CeedInt i = 0; i < num_input_fields; i++) {
1398     CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], Q, Q_1d,
1399                                                               true, is_all_tensor, is_at_points, use_3d_slices));
1400   }
1401   code << "\n  // Output field constants and basis data\n";
1402   for (CeedInt i = 0; i < num_output_fields; i++) {
1403     CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], Q,
1404                                                               Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
1405   }
1406 
1407   // Loop over all elements
1408   code << "\n  // Element loop\n";
1409   code << "  __syncthreads();\n";
1410   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
1411 
1412   // -- Compute minimum buffer space needed
1413   CeedInt max_rstr_buffer_size = 1;
1414 
1415   for (CeedInt i = 0; i < num_input_fields; i++) {
1416     CeedInt             num_comp, elem_size;
1417     CeedElemRestriction elem_rstr;
1418 
1419     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
1420     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1421     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
1422     max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? elem_size : 1));
1423     CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1424   }
1425   for (CeedInt i = 0; i < num_output_fields; i++) {
1426     CeedInt             num_comp, elem_size;
1427     CeedElemRestriction elem_rstr;
1428 
1429     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
1430     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1431     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
1432     max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? elem_size : 1));
1433     CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1434   }
1435   code << "    // Scratch restriction buffer space\n";
1436   code << "    CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
1437 
1438   // -- Determine best input field processing order
1439   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
1440 
1441   for (CeedInt i = 0; i < num_input_fields; i++) {
1442     field_rstr_in_buffer[i] = -1;
1443     input_field_order[i]    = -1;
1444   }
1445   {
1446     bool    is_ordered[CEED_FIELD_MAX];
1447     CeedInt curr_index = 0;
1448 
1449     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
1450     for (CeedInt i = 0; i < num_input_fields; i++) {
1451       CeedVector          vec_i;
1452       CeedElemRestriction rstr_i;
1453 
1454       if (is_ordered[i]) continue;
1455       field_rstr_in_buffer[i]       = i;
1456       is_ordered[i]                 = true;
1457       input_field_order[curr_index] = i;
1458       curr_index++;
1459       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
1460       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
1461       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
1462       for (CeedInt j = i + 1; j < num_input_fields; j++) {
1463         CeedVector          vec_j;
1464         CeedElemRestriction rstr_j;
1465 
1466         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
1467         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
1468         if (rstr_i == rstr_j && vec_i == vec_j) {
1469           field_rstr_in_buffer[j]       = i;
1470           is_ordered[j]                 = true;
1471           input_field_order[curr_index] = j;
1472           curr_index++;
1473         }
1474         CeedCallBackend(CeedVectorDestroy(&vec_j));
1475         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
1476       }
1477       CeedCallBackend(CeedVectorDestroy(&vec_i));
1478       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
1479     }
1480   }
1481 
1482   // -- Input restriction and basis
1483   code << "\n    // -- Input field restrictions and basis actions\n";
1484   for (CeedInt i = 0; i < num_input_fields; i++) {
1485     const char   *field_name;
1486     const CeedInt f = input_field_order[i];
1487 
1488     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
1489     code << "    // ---- Input field " << f << ": " << field_name << "\n";
1490 
1491     // ---- Restriction
1492     CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, f, max_dim, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
1493                                                                 Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1494 
1495     // ---- Basis action
1496     CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, f, max_dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, is_all_tensor,
1497                                                           is_at_points, use_3d_slices));
1498   }
1499 
1500   // -- Q function
1501   CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, max_dim, max_num_points, num_input_fields, op_input_fields, qf_input_fields,
1502                                                             num_output_fields, op_output_fields, qf_output_fields, qfunction_name, Q_1d,
1503                                                             is_all_tensor, is_at_points, use_3d_slices));
1504 
1505   // -- Output basis and restriction
1506   code << "\n    // -- Output field basis action and restrictions\n";
1507   for (CeedInt i = 0; i < num_output_fields; i++) {
1508     const char *field_name;
1509 
1510     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
1511     code << "    // ---- Output field " << i << ": " << field_name << "\n";
1512 
1513     // ---- Basis action
1514     CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, max_dim, op_output_fields[i], qf_output_fields[i], Q_1d, false,
1515                                                           is_all_tensor, is_at_points, use_3d_slices));
1516 
1517     // ---- Restriction
1518     CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, max_dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false,
1519                                                                 is_all_tensor, is_at_points, use_3d_slices));
1520   }
1521 
1522   // Close loop and function
1523   code << "  }\n";
1524   code << "}\n";
1525   code << "// -----------------------------------------------------------------------------\n\n";
1526 
1527   // Compile
1528   {
1529     bool          is_compile_good = false;
1530     const CeedInt T_1d            = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d);
1531 
1532     CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "OP_T_1D", T_1d));
1533     if (is_compile_good) {
1534       *is_good_build = true;
1535       CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op));
1536     } else {
1537       *is_good_build     = false;
1538       data->use_fallback = true;
1539     }
1540   }
1541   CeedCallBackend(CeedOperatorSetSetupDone(op));
1542   CeedCallBackend(CeedDestroy(&ceed));
1543   CeedCallBackend(CeedQFunctionDestroy(&qf));
1544   return CEED_ERROR_SUCCESS;
1545 }
1546 
1547 //------------------------------------------------------------------------------
1548