xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision 412e56839ae1393ae29619b6fd39ea10cdd89adf)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #define CEED_DEBUG_COLOR 12
9 
10 #include <ceed.h>
11 #include <ceed/backend.h>
12 #include <ceed/jit-tools.h>
13 #include <cuda_runtime.h>
14 
15 #include <iostream>
16 #include <sstream>
17 #include <string>
18 
19 #include "../cuda-ref/ceed-cuda-ref.h"
20 #include "../cuda-shared/ceed-cuda-shared.h"
21 #include "../cuda/ceed-cuda-common.h"
22 #include "../cuda/ceed-cuda-compile.h"
23 #include "ceed-cuda-gen.h"
24 
25 struct FieldReuse_Cuda {
26   CeedInt      index;
27   bool         is_input;
28   CeedEvalMode eval_mode;
29 };
30 
31 //------------------------------------------------------------------------------
32 // Determine type of operator
33 //------------------------------------------------------------------------------
34 static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
35                                                 CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields,
36                                                 CeedQFunctionField *qf_output_fields, CeedInt *max_P, CeedInt *max_P_1d, CeedInt *Q, CeedInt *Q_1d,
37                                                 CeedInt *max_dim, bool *is_all_tensor, bool *use_3d_slices) {
38   // Check if all are tensor
39   *is_all_tensor = true;
40   for (CeedInt i = 0; i < num_input_fields; i++) {
41     CeedBasis basis;
42 
43     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
44     if (basis != CEED_BASIS_NONE) {
45       bool is_field_tensor;
46 
47       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
48       *is_all_tensor = *is_all_tensor && is_field_tensor;
49     }
50     CeedCallBackend(CeedBasisDestroy(&basis));
51   }
52   for (CeedInt i = 0; i < num_output_fields; i++) {
53     CeedBasis basis;
54 
55     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
56     if (basis != CEED_BASIS_NONE) {
57       bool is_field_tensor;
58 
59       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
60       *is_all_tensor = *is_all_tensor && is_field_tensor;
61     }
62     CeedCallBackend(CeedBasisDestroy(&basis));
63   }
64 
65   // Find max_P, max_P_1d, Q, and Q_1d
66   bool is_all_3d = true;
67 
68   *max_P    = 0;
69   *max_P_1d = 0;
70   *Q        = 0;
71   *Q_1d     = 0;
72   for (CeedInt i = 0; i < num_input_fields; i++) {
73     CeedBasis basis;
74 
75     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
76     if (basis != CEED_BASIS_NONE) {
77       bool    is_field_tensor;
78       CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
79 
80       // Check if 3D
81       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
82       is_all_3d = is_all_3d && (field_dim == 3);
83       *max_dim  = CeedIntMax(*max_dim, field_dim);
84 
85       // Collect P, P_1d, Q, and Q_1d
86       CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
87       *max_P = CeedIntMax(*max_P, field_P);
88       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
89       if (is_field_tensor) {
90         CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
91         *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
92       }
93       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
94       CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
95       *Q = field_Q;
96       if (is_field_tensor) {
97         CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
98         CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
99         *Q_1d = field_Q_1d;
100       }
101     }
102     CeedCallBackend(CeedBasisDestroy(&basis));
103   }
104   for (CeedInt i = 0; i < num_output_fields; i++) {
105     CeedBasis basis;
106 
107     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
108     if (basis != CEED_BASIS_NONE) {
109       bool    is_field_tensor;
110       CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
111 
112       // Check if 3D
113       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
114       is_all_3d = is_all_3d && (field_dim == 3);
115       *max_dim  = CeedIntMax(*max_dim, field_dim);
116 
117       // Collect P, P_1d, Q, and Q_1d
118       CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
119       *max_P = CeedIntMax(*max_P, field_P);
120       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
121       if (is_field_tensor) {
122         CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
123         *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
124       }
125       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
126       CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
127       *Q = field_Q;
128       if (is_field_tensor) {
129         CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
130         CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
131         *Q_1d = field_Q_1d;
132       }
133     }
134     CeedCallBackend(CeedBasisDestroy(&basis));
135   }
136 
137   // Only use 3D collocated gradient parallelization strategy when gradient is computed
138   *use_3d_slices = false;
139   if (is_all_3d && *is_all_tensor) {
140     bool was_grad_found = false;
141 
142     for (CeedInt i = 0; i < num_input_fields; i++) {
143       CeedEvalMode eval_mode;
144 
145       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
146       if (eval_mode == CEED_EVAL_GRAD) {
147         CeedBasis_Cuda_shared *basis_data;
148         CeedBasis              basis;
149 
150         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
151         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
152         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
153         was_grad_found = true;
154         CeedCallBackend(CeedBasisDestroy(&basis));
155       }
156     }
157     for (CeedInt i = 0; i < num_output_fields; i++) {
158       CeedEvalMode eval_mode;
159 
160       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
161       if (eval_mode == CEED_EVAL_GRAD) {
162         CeedBasis_Cuda_shared *basis_data;
163         CeedBasis              basis;
164 
165         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
166         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
167         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
168         was_grad_found = true;
169         CeedCallBackend(CeedBasisDestroy(&basis));
170       }
171     }
172   }
173   return CEED_ERROR_SUCCESS;
174 }
175 
176 //------------------------------------------------------------------------------
177 // Setup fields
178 //------------------------------------------------------------------------------
179 static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedOperatorField op_field,
180                                                      CeedQFunctionField qf_field, FieldReuse_Cuda field_reuse, CeedInt Q_1d, bool is_input,
181                                                      bool is_all_tensor, bool is_at_points, bool use_3d_slices) {
182   bool      is_tensor = true;
183   CeedBasis basis;
184   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
185   if (basis != CEED_BASIS_NONE) CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
186 
187   const char            *field_name;
188   std::string            var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
189   std::string            P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
190   std::string            option_name = (is_input ? "inputs" : "outputs");
191   CeedEvalMode           eval_mode   = CEED_EVAL_NONE;
192   CeedInt                elem_size = 0, num_comp = 0, dim = 1, P_1d = 0;
193   CeedElemRestriction    elem_rstr;
194   CeedBasis_Cuda_shared *basis_data;
195 
196   // Field reuse info
197   bool use_previous_field = field_reuse.index != -1;
198 
199   CeedCallBackend(CeedOperatorFieldGetName(op_field, &field_name));
200   code << "  // -- " << (is_input ? "Input" : "Output") << " field " << i << ": " << field_name << "\n";
201 
202   // Get field data
203   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
204   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
205     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
206     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
207   }
208   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
209   if (basis != CEED_BASIS_NONE) {
210     CeedCallBackend(CeedBasisGetData(basis, &basis_data));
211     CeedCallBackend(CeedBasisGetDimension(basis, &dim));
212     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
213     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
214   }
215   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
216 
217   // Set field constants
218   code << "  const CeedInt dim" << var_suffix << " = " << dim << ";\n";
219   if (is_tensor && !is_all_tensor) {
220     CeedInt P = 0;
221 
222     CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
223     code << "  const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P) << ";\n";
224   }
225   code << "  const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
226   if (eval_mode != CEED_EVAL_WEIGHT) {
227     code << "  const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
228   }
229 
230   // Load basis data
231   code << "  // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
232   switch (eval_mode) {
233     case CEED_EVAL_NONE:
234       break;
235     case CEED_EVAL_INTERP:
236       if (is_at_points) {
237         // AtPoints
238         if (!basis_data->d_chebyshev_interp_1d) {
239           CeedSize    interp_bytes;
240           CeedScalar *chebyshev_interp_1d;
241 
242           interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
243           CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
244           CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
245           CeedCallCuda(CeedBasisReturnCeed(basis), cudaMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
246           CeedCallCuda(CeedBasisReturnCeed(basis),
247                        cudaMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
248           CeedCallBackend(CeedFree(&chebyshev_interp_1d));
249         }
250         if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
251         else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
252       } else {
253         // Standard quadrature
254         if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
255         else data->B.outputs[i] = basis_data->d_interp_1d;
256       }
257       if (use_previous_field) {
258         std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
259 
260         code << "  CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
261       } else {
262         code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
263         code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
264       }
265       break;
266     case CEED_EVAL_GRAD:
267       if (is_at_points) {
268         // AtPoints
269         if (!basis_data->d_chebyshev_interp_1d) {
270           CeedSize    interp_bytes;
271           CeedScalar *chebyshev_interp_1d;
272 
273           interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
274           CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
275           CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
276           CeedCallCuda(CeedBasisReturnCeed(basis), cudaMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
277           CeedCallCuda(CeedBasisReturnCeed(basis),
278                        cudaMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
279           CeedCallBackend(CeedFree(&chebyshev_interp_1d));
280         }
281         if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
282         else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
283       } else {
284         // Standard quadrature
285         if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
286         else data->B.outputs[i] = basis_data->d_interp_1d;
287       }
288       if (is_tensor) {
289         if (use_previous_field) {
290           std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
291 
292           code << "  CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
293         } else {
294           code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
295           code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
296         }
297       }
298       if (is_at_points) break;  // No G mat for AtPoints
299       if (use_3d_slices) {
300         if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
301         else data->G.outputs[i] = basis_data->d_collo_grad_1d;
302         if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
303           std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
304 
305           code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
306         } else {
307           code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
308           code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
309         }
310       } else {
311         bool has_collo_grad = basis_data->d_collo_grad_1d;
312 
313         if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
314         else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
315         if (has_collo_grad) {
316           if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
317             std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
318 
319             code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
320           } else {
321             code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
322             code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
323           }
324         } else {
325           if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
326             std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
327 
328             code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
329           } else {
330             code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim")
331                  << (is_tensor ? "" : var_suffix) << "];\n";
332             code << "  LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << (is_tensor ? "" : var_suffix) << ">(data, G."
333                  << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
334           }
335         }
336       }
337       break;
338     case CEED_EVAL_WEIGHT:
339       break;  // No action
340       // LCOV_EXCL_START
341     case CEED_EVAL_DIV:
342     case CEED_EVAL_CURL:
343       break;  // TODO: Not implemented
344               // LCOV_EXCL_STOP
345   }
346   CeedCallBackend(CeedBasisDestroy(&basis));
347   return CEED_ERROR_SUCCESS;
348 }
349 
350 //------------------------------------------------------------------------------
351 // Restriction
352 //------------------------------------------------------------------------------
353 static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt max_dim,
354                                                        CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
355                                                        CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points, bool use_3d_slices) {
356   std::string               var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
357   std::string               P_name     = (is_all_tensor ? "P_1d" : "P") + var_suffix;
358   CeedEvalMode              eval_mode  = CEED_EVAL_NONE;
359   CeedInt                   elem_size = 0, num_comp = 0;
360   CeedSize                  l_size;
361   CeedRestrictionType       rstr_type = CEED_RESTRICTION_STANDARD;
362   CeedElemRestriction_Cuda *rstr_data;
363   CeedElemRestriction       elem_rstr;
364 
365   // Get field data
366   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
367   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
368     CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
369     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
370     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
371     CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
372   }
373   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
374 
375   // Restriction
376   if (is_input) {
377     // Input
378     if (field_input_buffer[i] != i) {
379       std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);
380 
381       // Restriction was already done for previous input
382       code << "    CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
383     } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) {
384       if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) {
385         // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated
386         code << "    CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
387       } else if (rstr_type != CEED_RESTRICTION_POINTS) {
388         // Otherwise we're using the scratch space
389         code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
390       }
391       switch (rstr_type) {
392         case CEED_RESTRICTION_STANDARD: {
393           CeedInt comp_stride;
394 
395           CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
396           code << "    const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
397           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
398           code << "    // CompStride: " << comp_stride << "\n";
399           data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
400           code << "    ReadLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name
401                << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix << ");\n";
402           break;
403         }
404         case CEED_RESTRICTION_STRIDED: {
405           bool    has_backend_strides;
406           CeedInt num_elem;
407 
408           CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
409           CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
410           CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
411 
412           if (!has_backend_strides) {
413             CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
414           }
415           code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
416           code << "    ReadLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", " << strides[0] << ", "
417                << strides[1] << ", " << strides[2] << ">(data, elem, d" << var_suffix << ", r_e" << var_suffix << ");\n";
418           break;
419         }
420         case CEED_RESTRICTION_POINTS: {
421           CeedInt comp_stride;
422 
423           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
424           code << "    const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
425           data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
426           break;
427         }
428         // LCOV_EXCL_START
429         case CEED_RESTRICTION_ORIENTED:
430         case CEED_RESTRICTION_CURL_ORIENTED:
431           break;  // TODO: Not implemented
432                   // LCOV_EXCL_STOP
433       }
434     }
435   } else {
436     // Output
437     switch (rstr_type) {
438       case CEED_RESTRICTION_STANDARD: {
439         CeedInt comp_stride;
440 
441         CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
442         code << "    const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
443         CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
444         code << "    // CompStride: " << comp_stride << "\n";
445         data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
446         code << "    WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name
447              << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix << ");\n";
448         break;
449       }
450       case CEED_RESTRICTION_STRIDED: {
451         bool    has_backend_strides;
452         CeedInt num_elem;
453 
454         CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
455         CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
456         CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
457 
458         if (!has_backend_strides) {
459           CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
460         }
461         code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
462         code << "    WriteLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", " << strides[0] << ", "
463              << strides[1] << ", " << strides[2] << ">(data, elem, r_e" << var_suffix << ", d" << var_suffix << ");\n";
464         break;
465       }
466       case CEED_RESTRICTION_POINTS:
467         data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
468         break;
469       // LCOV_EXCL_START
470       case CEED_RESTRICTION_ORIENTED:
471       case CEED_RESTRICTION_CURL_ORIENTED:
472         break;  // TODO: Not implemented
473                 // LCOV_EXCL_STOP
474     }
475   }
476   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
477   return CEED_ERROR_SUCCESS;
478 }
479 
480 //------------------------------------------------------------------------------
481 // Basis
482 //------------------------------------------------------------------------------
483 static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt max_dim,
484                                                  CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input,
485                                                  bool is_all_tensor, bool is_at_points, bool use_3d_slices) {
486   bool      is_tensor = true;
487   CeedBasis basis;
488   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
489   CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
490 
491   std::string         var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
492   std::string         P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
493   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
494   CeedInt             dim = max_dim, elem_size = 0, num_comp = 0, P_1d = 0;
495   CeedElemRestriction elem_rstr;
496 
497   // Get field data
498   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
499   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
500     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
501     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
502   }
503   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
504   if (basis != CEED_BASIS_NONE) {
505     CeedCallBackend(CeedBasisGetDimension(basis, &dim));
506     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
507     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
508   }
509   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
510 
511   // Basis
512   code << "    // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
513   if (is_input) {
514     switch (eval_mode) {
515       case CEED_EVAL_NONE:
516         if (!use_3d_slices && !is_at_points) {
517           code << "    CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n";
518         }
519         break;
520       case CEED_EVAL_INTERP:
521         if (is_at_points) {
522           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
523 
524           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
525           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
526                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
527         } else {
528           std::string function_name = is_tensor
529                                           ? ((dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
530                                           : "InterpNonTensor";
531 
532           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
533           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << (P_1d > Q_1d ? P_name : Q_name)
534                << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
535         }
536         break;
537       case CEED_EVAL_GRAD:
538         if (is_at_points) {
539           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
540 
541           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
542           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
543                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
544         } else if (use_3d_slices) {
545           std::string function_name = (dim > 1 ? "InterpTensor" : "Interp") + std::to_string(dim) + "d";
546 
547           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
548           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
549                << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
550         } else if (is_tensor) {
551           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
552           std::string function_name = (dim == 1 ? "Grad" : (is_collocated ? "GradTensorCollocated" : "GradTensor")) + std::to_string(dim) + "d" +
553                                       (is_all_tensor ? "" : "Flattened");
554 
555           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*" << (dim >= 3 ? Q_name : "1")
556                << "];\n";
557           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << (P_1d > Q_1d ? P_name : Q_name)
558                << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
559         } else {
560           std::string function_name = "GradNonTensor";
561 
562           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
563           code << "    " << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name << ", "
564                << (P_1d > Q_1d ? P_name : Q_name) << ">(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
565         }
566         break;
567       case CEED_EVAL_WEIGHT: {
568         if (is_at_points) {
569           code << "    // Nothing to do AtPoints\n";
570         } else {
571           CeedBasis_Cuda_shared *basis_data;
572           std::string            function_name = is_tensor
573                                                      ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
574                                                      : "WeightNonTensor";
575 
576           code << "    CeedScalar r_q" << var_suffix << "[" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
577           CeedCallBackend(CeedBasisGetData(basis, &basis_data));
578           data->W = basis_data->d_q_weight_1d;
579           code << "    " << function_name << "<" << P_name << ", " << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
580         }
581         break;
582       }
583       // LCOV_EXCL_START
584       case CEED_EVAL_DIV:
585       case CEED_EVAL_CURL:
586         break;  // TODO: Not implemented
587                 // LCOV_EXCL_STOP
588     }
589   } else {
590     switch (eval_mode) {
591       case CEED_EVAL_NONE:
592         code << "    CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
593         break;  // No action
594       case CEED_EVAL_INTERP:
595         code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
596         if (is_at_points) {
597           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
598 
599           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
600                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
601         } else {
602           std::string function_name =
603               is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
604                         : "InterpTransposeNonTensor";
605 
606           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << (P_1d > Q_1d ? P_name : Q_name)
607                << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
608         }
609         break;
610       case CEED_EVAL_GRAD:
611         code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
612         if (is_at_points) {
613           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
614 
615           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
616                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
617         } else if (use_3d_slices) {
618           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
619 
620           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
621                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
622         } else if (is_tensor) {
623           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
624           std::string function_name = (dim == 1 ? "GradTranspose" : (is_collocated ? "GradTransposeTensorCollocated" : "GradTransposeTensor")) +
625                                       std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
626 
627           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << (P_1d > Q_1d ? P_name : Q_name)
628                << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
629         } else {
630           std::string function_name = "GradTransposeNonTensor";
631 
632           code << "    " << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name << ", "
633                << (P_1d > Q_1d ? P_name : Q_name) << ">(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
634         }
635         break;
636       // LCOV_EXCL_START
637       case CEED_EVAL_WEIGHT:
638         break;  // Should not occur
639       case CEED_EVAL_DIV:
640       case CEED_EVAL_CURL:
641         break;  // TODO: Not implemented
642                 // LCOV_EXCL_STOP
643     }
644   }
645   CeedCallBackend(CeedBasisDestroy(&basis));
646   return CEED_ERROR_SUCCESS;
647 }
648 
649 //------------------------------------------------------------------------------
650 // QFunction
651 //------------------------------------------------------------------------------
652 static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt max_dim, CeedInt max_num_points,
653                                                      CeedInt num_input_fields, CeedOperatorField *op_input_fields,
654                                                      CeedQFunctionField *qf_input_fields, CeedInt num_output_fields,
655                                                      CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields,
656                                                      std::string qfunction_name, CeedInt Q_1d, bool is_all_tensor, bool is_at_points,
657                                                      bool use_3d_slices) {
658   std::string         Q_name    = is_all_tensor ? "Q_1d" : "Q";
659   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
660   CeedElemRestriction elem_rstr;
661 
662   // Setup output arrays
663   code << "\n    // -- Output field setup\n";
664   for (CeedInt i = 0; i < num_output_fields; i++) {
665     const char *field_name;
666     std::string var_suffix = "_out_" + std::to_string(i);
667 
668     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
669     code << "    // ---- Output field " << i << ": " << field_name << "\n";
670     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
671     switch (eval_mode) {
672       case CEED_EVAL_NONE:
673         if (is_at_points) {
674           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n";
675         } else {
676           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
677                << "];\n";
678         }
679         break;
680       case CEED_EVAL_INTERP:
681         if (is_at_points) {
682           // Accumulator for point data
683           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
684           code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) {\n";
685           code << "      r_c" << var_suffix << "[i] = 0.0;\n";
686           code << "    }\n";
687         } else {
688           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
689                << "];\n";
690         }
691         break;
692       case CEED_EVAL_GRAD:
693         if (is_at_points) {
694           // Accumulator for point data
695           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "*dim" << var_suffix
696                << "];\n";
697           code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) {\n";
698           code << "      r_c" << var_suffix << "[i] = 0.0;\n";
699           code << "    }\n";
700         } else if (use_3d_slices) {
701           // Accumulator for gradient slices
702           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
703           code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) {\n";
704           code << "      r_q" << var_suffix << "[i] = 0.0;\n";
705           code << "    }\n";
706         } else {
707           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
708                << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") << "];\n";
709         }
710         break;
711       case CEED_EVAL_WEIGHT:
712         break;
713         // LCOV_EXCL_START
714       case CEED_EVAL_DIV:
715       case CEED_EVAL_CURL:
716         break;  // TODO: Not implemented
717                 // LCOV_EXCL_STOP
718     }
719   }
720 
721   if (is_at_points) {
722     // We need to handle batches of points
723     code << "\n    // Note: Using batches of points\n";
724     code << "    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * max_num_points / (blockDim.x * blockDim.y));\n\n";
725     code << "    #pragma unroll\n";
726     code << "    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {\n";
727     code << "      const CeedInt p = i % max_num_points;\n\n";
728 
729     code << "      // -- Coordinates\n";
730     code << "      CeedScalar r_x[max_dim];\n";
731     code << "      ReadPoint<max_dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n";
732 
733     code << "      // -- Input fields\n";
734     for (CeedInt i = 0; i < num_input_fields; i++) {
735       const char *field_name;
736       std::string var_suffix = "_in_" + std::to_string(i);
737       std::string P_name     = "P_1d" + var_suffix;
738 
739       CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
740       code << "      // ---- Input field " << i << ": " << field_name << "\n";
741       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
742       // Basis action
743       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
744       switch (eval_mode) {
745         case CEED_EVAL_NONE:
746           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
747           code << "      ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
748                << ", max_num_points>(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
749           break;
750         case CEED_EVAL_INTERP:
751           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
752           code << "      InterpAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
753                << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
754           break;
755         case CEED_EVAL_GRAD:
756           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
757           code << "      GradAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
758                << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
759           break;
760         case CEED_EVAL_WEIGHT:
761           code << "      CeedScalar r_s" << var_suffix << "[1];\n";
762           code << "      r_s" << var_suffix << "[0] = 1.0;\n";
763           break;
764           // LCOV_EXCL_START
765         case CEED_EVAL_DIV:
766         case CEED_EVAL_CURL:
767           break;  // TODO: Not implemented
768                   // LCOV_EXCL_STOP
769       }
770     }
771     code << "\n      // -- Output fields\n";
772     for (CeedInt i = 0; i < num_output_fields; i++) {
773       const char *field_name;
774       std::string var_suffix = "_out_" + std::to_string(i);
775 
776       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
777       code << "      // ---- Output field " << i << ": " << field_name << "\n";
778       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
779       // Basis action
780       switch (eval_mode) {
781         case CEED_EVAL_NONE:
782           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
783           break;
784         case CEED_EVAL_INTERP:
785           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
786           break;
787         case CEED_EVAL_GRAD:
788           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
789           break;
790           // LCOV_EXCL_START
791         case CEED_EVAL_WEIGHT:
792           break;  // Should not occur
793         case CEED_EVAL_DIV:
794         case CEED_EVAL_CURL:
795           break;  // TODO: Not implemented
796                   // LCOV_EXCL_STOP
797       }
798     }
799 
800   } else if (use_3d_slices) {
801     // We treat quadrature points per slice in 3d to save registers
802     code << "\n    // Note: Using planes of 3D elements\n";
803     code << "    #pragma unroll\n";
804     code << "    for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
805     code << "      // -- Input fields\n";
806     for (CeedInt i = 0; i < num_input_fields; i++) {
807       const char *field_name;
808       std::string var_suffix = "_in_" + std::to_string(i);
809 
810       CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
811       code << "      // ---- Input field " << i << ": " << field_name << "\n";
812       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
813       // Basis action
814       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
815       switch (eval_mode) {
816         case CEED_EVAL_NONE:
817           bool is_strided;
818 
819           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
820 
821           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
822           CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
823           if (is_strided) {
824             bool    has_backend_strides;
825             CeedInt num_elem, elem_size;
826 
827             CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
828             CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
829             CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
830             CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
831 
832             if (!has_backend_strides) {
833               CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
834             }
835             code << "      // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
836             code << "      ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", " << strides[0] << ", " << strides[1] << ", "
837                  << strides[2] << ">(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n";
838           } else {
839             CeedSize                  l_size = 0;
840             CeedInt                   comp_stride;
841             CeedElemRestriction_Cuda *rstr_data;
842 
843             CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
844             code << "      const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
845             CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
846             code << "      // CompStride: " << comp_stride << "\n";
847             CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
848             data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
849             code << "      ReadEVecSliceStandard3d<num_comp" << var_suffix << ", " << comp_stride << ", " << Q_name << ">(data, l_size" << var_suffix
850                  << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
851           }
852           CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
853           break;
854         case CEED_EVAL_INTERP:
855           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
856           code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n";
857           code << "        r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n";
858           code << "      }\n";
859           break;
860         case CEED_EVAL_GRAD:
861           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
862           code << "      GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_q" << var_suffix << ", s_G"
863                << var_suffix << ", r_s" << var_suffix << ");\n";
864           break;
865         case CEED_EVAL_WEIGHT:
866           code << "      CeedScalar r_s" << var_suffix << "[1];\n";
867           code << "      r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n";
868           break;
869           // LCOV_EXCL_START
870         case CEED_EVAL_DIV:
871         case CEED_EVAL_CURL:
872           break;  // TODO: Not implemented
873                   // LCOV_EXCL_STOP
874       }
875     }
876     code << "\n      // -- Output fields\n";
877     for (CeedInt i = 0; i < num_output_fields; i++) {
878       const char *field_name;
879       std::string var_suffix = "_out_" + std::to_string(i);
880 
881       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
882       code << "      // ---- Output field " << i << ": " << field_name << "\n";
883       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
884       // Basis action
885       switch (eval_mode) {
886         case CEED_EVAL_NONE:
887           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
888           break;
889         case CEED_EVAL_INTERP:
890           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
891           break;
892         case CEED_EVAL_GRAD:
893           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
894           break;
895           // LCOV_EXCL_START
896         case CEED_EVAL_WEIGHT:
897           break;  // Should not occur
898         case CEED_EVAL_DIV:
899         case CEED_EVAL_CURL:
900           break;  // TODO: Not implemented
901                   // LCOV_EXCL_STOP
902       }
903     }
904   } else {
905     code << "\n    // Note: Using full elements\n";
906     code << "    {\n";
907     code << "      // -- Input fields\n";
908     for (CeedInt i = 0; i < num_input_fields; i++) {
909       const char *field_name;
910 
911       CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
912       code << "      // ---- Input field " << i << ": " << field_name << "\n";
913       code << "      CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n";
914     }
915     code << "      // -- Output fields\n";
916     for (CeedInt i = 0; i < num_output_fields; i++) {
917       const char *field_name;
918 
919       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
920       code << "      // ---- Output field " << i << ": " << field_name << "\n";
921       code << "      CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n";
922     }
923   }
924 
925   // Input and output buffers
926   code << "\n      // -- QFunction inputs and outputs\n";
927   code << "      // ---- Inputs\n";
928   code << "      CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
929   for (CeedInt i = 0; i < num_input_fields; i++) {
930     const char *field_name;
931 
932     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
933     code << "      // ------ Input field " << i << ": " << field_name << "\n";
934     code << "      inputs[" << i << "] = r_s_in_" << i << ";\n";
935   }
936   code << "      // ---- Outputs\n";
937   code << "      CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
938   for (CeedInt i = 0; i < num_output_fields; i++) {
939     const char *field_name;
940 
941     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
942     code << "      // ------ Output field " << i << ": " << field_name << "\n";
943     code << "      outputs[" << i << "] = r_s_out_" << i << ";\n";
944   }
945 
946   // Apply QFunction
947   code << "\n      // -- Apply QFunction\n";
948   code << "      " << qfunction_name << "(ctx, ";
949   if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
950     code << "1";
951   } else {
952     code << Q_name;
953   }
954   code << ", inputs, outputs);\n";
955 
956   if (is_at_points) {
957     // Map back to coefficients
958     code << "\n      // -- Output fields\n";
959     for (CeedInt i = 0; i < num_output_fields; i++) {
960       const char *field_name;
961       std::string var_suffix = "_out_" + std::to_string(i);
962       std::string P_name     = "P_1d" + var_suffix;
963 
964       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
965       code << "      // ---- Output field " << i << ": " << field_name << "\n";
966       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
967       // Basis action
968       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
969       switch (eval_mode) {
970         case CEED_EVAL_NONE: {
971           CeedInt             comp_stride;
972           CeedElemRestriction elem_rstr;
973 
974           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
975           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
976           CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
977           code << "      const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
978           code << "      WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
979                << ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]"
980                << ", r_s" << var_suffix << ", d" << var_suffix << ");\n";
981           break;
982         }
983         case CEED_EVAL_INTERP:
984           code << "      if (i >= points.num_per_elem[elem]) {\n";
985           code << "        for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
986           code << "      }\n";
987           code << "      InterpTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
988                << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
989           break;
990         case CEED_EVAL_GRAD:
991           code << "      if (i >= points.num_per_elem[elem]) {\n";
992           code << "        for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
993           code << "      }\n";
994           code << "      GradTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
995                << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
996           break;
997           // LCOV_EXCL_START
998         case CEED_EVAL_WEIGHT:
999           break;  // Should not occur
1000         case CEED_EVAL_DIV:
1001         case CEED_EVAL_CURL:
1002           break;  // TODO: Not implemented
1003                   // LCOV_EXCL_STOP
1004       }
1005     }
1006   } else if (use_3d_slices) {
1007     // Copy or apply transpose grad, if needed
1008     code << "\n      // -- Output fields\n";
1009     for (CeedInt i = 0; i < num_output_fields; i++) {
1010       const char *field_name;
1011       std::string var_suffix = "_out_" + std::to_string(i);
1012       std::string P_name     = "P_1d" + var_suffix;
1013 
1014       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
1015       code << "      // ---- Output field " << i << ": " << field_name << "\n";
1016       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
1017       // Basis action
1018       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
1019       switch (eval_mode) {
1020         case CEED_EVAL_NONE:
1021           code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
1022           code << "        r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
1023           code << "      }\n";
1024           break;
1025         case CEED_EVAL_INTERP:
1026           code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
1027           code << "        r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
1028           code << "      }\n";
1029           break;
1030         case CEED_EVAL_GRAD:
1031           code << "      GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_s" << var_suffix << ", s_G"
1032                << var_suffix << ", r_q" << var_suffix << ");\n";
1033           break;
1034           // LCOV_EXCL_START
1035         case CEED_EVAL_WEIGHT:
1036           break;  // Should not occur
1037         case CEED_EVAL_DIV:
1038         case CEED_EVAL_CURL:
1039           break;  // TODO: Not implemented
1040                   // LCOV_EXCL_STOP
1041       }
1042     }
1043   }
1044   code << "    }\n";
1045   return CEED_ERROR_SUCCESS;
1046 }
1047 
1048 //------------------------------------------------------------------------------
1049 // Build single operator kernel
1050 //------------------------------------------------------------------------------
1051 extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_build) {
1052   bool                    is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
1053   Ceed                    ceed;
1054   CeedInt                 Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
1055   CeedQFunctionField     *qf_input_fields, *qf_output_fields;
1056   CeedQFunction_Cuda_gen *qf_data;
1057   CeedQFunction           qf;
1058   CeedOperatorField      *op_input_fields, *op_output_fields;
1059   CeedOperator_Cuda_gen  *data;
1060   std::ostringstream      code;
1061 
1062   CeedCallBackend(CeedOperatorGetData(op, &data));
1063   {
1064     bool is_setup_done;
1065 
1066     CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
1067     if (is_setup_done) {
1068       *is_good_build = !data->use_fallback;
1069       return CEED_ERROR_SUCCESS;
1070     }
1071   }
1072 
1073   // Check field compatibility
1074   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
1075   {
1076     bool has_shared_bases = true;
1077 
1078     for (CeedInt i = 0; i < num_input_fields; i++) {
1079       CeedBasis basis;
1080 
1081       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
1082       if (basis != CEED_BASIS_NONE) {
1083         bool        is_tensor = true;
1084         const char *resource;
1085         char       *resource_root;
1086         Ceed        basis_ceed;
1087 
1088         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
1089         is_all_tensor    = is_all_tensor && is_tensor;
1090         is_all_nontensor = is_all_nontensor && !is_tensor;
1091         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
1092         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
1093         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
1094         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared");
1095         CeedCallBackend(CeedFree(&resource_root));
1096         CeedCallBackend(CeedDestroy(&basis_ceed));
1097       }
1098       CeedCallBackend(CeedBasisDestroy(&basis));
1099     }
1100 
1101     for (CeedInt i = 0; i < num_output_fields; i++) {
1102       CeedBasis basis;
1103 
1104       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
1105       if (basis != CEED_BASIS_NONE) {
1106         bool        is_tensor = true;
1107         const char *resource;
1108         char       *resource_root;
1109         Ceed        basis_ceed;
1110 
1111         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
1112         is_all_tensor    = is_all_tensor && is_tensor;
1113         is_all_nontensor = is_all_nontensor && !is_tensor;
1114 
1115         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
1116         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
1117         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
1118         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared");
1119         CeedCallBackend(CeedFree(&resource_root));
1120         CeedCallBackend(CeedDestroy(&basis_ceed));
1121       }
1122       CeedCallBackend(CeedBasisDestroy(&basis));
1123     }
1124     // -- Fallback to ref if not all bases are shared
1125     if (!has_shared_bases) {
1126       *is_good_build = false;
1127       return CEED_ERROR_SUCCESS;
1128     }
1129   }
1130   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1131   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
1132   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
1133   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
1134 
1135   // Get operator data
1136   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
1137   {
1138     CeedInt max_P, max_P_1d;
1139 
1140     CeedCallBackend(CeedOperatorBuildKernelData_Cuda_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields,
1141                                                          op_output_fields, qf_output_fields, &max_P, &max_P_1d, &Q, &Q_1d, &max_dim, &is_all_tensor,
1142                                                          &use_3d_slices));
1143     data->max_P_1d = is_all_tensor ? max_P_1d : max_P;
1144   }
1145   if (max_dim == 0) max_dim = 1;
1146   data->dim = max_dim;
1147   if (is_at_points) {
1148     CeedElemRestriction_Cuda *rstr_data;
1149     CeedElemRestriction       rstr_points = NULL;
1150 
1151     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
1152     CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
1153     CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
1154     CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data));
1155     data->points.indices = (CeedInt *)rstr_data->d_offsets;
1156     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
1157   }
1158   if (is_at_points) use_3d_slices = false;
1159   if (Q_1d == 0) {
1160     if (is_at_points) Q_1d = max_num_points;
1161     else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d));
1162   }
1163   data->Q_1d = Q_1d;
1164 
1165   // Check for restriction only identity operator
1166   {
1167     bool is_identity_qf;
1168 
1169     CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf));
1170     if (is_identity_qf) {
1171       CeedEvalMode eval_mode_in, eval_mode_out;
1172 
1173       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in));
1174       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out));
1175       CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND,
1176                 "Backend does not implement restriction only identity operators");
1177     }
1178   }
1179 
1180   // Add atomicAdd function for old NVidia architectures
1181   {
1182     Ceed_Cuda            *ceed_data;
1183     struct cudaDeviceProp prop;
1184 
1185     CeedCallBackend(CeedGetData(ceed, &ceed_data));
1186     CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id));
1187     if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) {
1188       code << "// AtomicAdd fallback source\n";
1189       code << "#include <ceed/jit-source/cuda/cuda-atomic-add-fallback.h>\n\n";
1190     }
1191   }
1192 
1193   // Load basis source files
1194   if (!is_all_nontensor) {
1195     code << "// Tensor basis source\n";
1196     code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n";
1197   }
1198   if (!is_all_tensor) {
1199     code << "// Non-tensor basis source\n";
1200     code << "#include <ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h>\n\n";
1201   }
1202   if (is_at_points) {
1203     code << "// AtPoints basis source\n";
1204     code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h>\n\n";
1205   }
1206   code << "// CodeGen operator source\n";
1207   code << "#include <ceed/jit-source/cuda/cuda-gen-templates.h>\n\n";
1208 
1209   // Get QFunction name
1210   std::string qfunction_name(qf_data->qfunction_name);
1211   std::string operator_name;
1212 
1213   operator_name = "CeedKernelCudaGenOperator_" + qfunction_name;
1214 
1215   // Define CEED_Q_VLA
1216   code << "\n#undef CEED_Q_VLA\n";
1217   if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
1218     code << "#define CEED_Q_VLA 1\n\n";
1219   } else {
1220     code << "#define CEED_Q_VLA " << Q_1d << "\n\n";
1221   }
1222 
1223   // Add user QFunction source
1224   {
1225     const char *source_path;
1226 
1227     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
1228     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/cuda/gen backend requires QFunction source code file");
1229 
1230     code << "// User QFunction source\n";
1231     code << "#include \"" << source_path << "\"\n\n";
1232   }
1233 
1234   // Setup
1235   code << "\n// -----------------------------------------------------------------------------\n";
1236   code << "// Operator Kernel\n";
1237   code << "// \n";
1238   code << "// d_[in,out]_i:   CeedVector device array\n";
1239   code << "// r_[in,out]_e_i: Element vector register\n";
1240   code << "// r_[in,out]_q_i: Quadrature space vector register\n";
1241   code << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
1242   code << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
1243   code << "// \n";
1244   code << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
1245   code << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
1246   code << "// -----------------------------------------------------------------------------\n";
1247   code << "extern \"C\" __global__ void " << operator_name
1248        << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda "
1249           "points) {\n";
1250 
1251   // Scratch buffers
1252   for (CeedInt i = 0; i < num_input_fields; i++) {
1253     CeedEvalMode eval_mode;
1254 
1255     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1256     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
1257       code << "  const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
1258     }
1259   }
1260   for (CeedInt i = 0; i < num_output_fields; i++) {
1261     code << "  CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
1262   }
1263 
1264   code << "  const CeedInt max_dim = " << max_dim << ";\n";
1265   if (!is_all_tensor) {
1266     code << "  const CeedInt Q = " << Q << ";\n";
1267   }
1268   if (!is_all_nontensor) {
1269     code << "  const CeedInt Q_1d = " << Q_1d << ";\n";
1270   }
1271   if (is_at_points) {
1272     code << "  const CeedInt max_num_points = " << max_num_points << ";\n";
1273     code << "  const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
1274   }
1275 
1276   // Shared data
1277   code << "  extern __shared__ CeedScalar slice[];\n";
1278   code << "  SharedData_Cuda data;\n";
1279   code << "  data.t_id_x = threadIdx.x;\n";
1280   code << "  data.t_id_y = threadIdx.y;\n";
1281   code << "  data.t_id_z = threadIdx.z;\n";
1282   code << "  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
1283   code << "  data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
1284 
1285   // -- Determine input mat reuse
1286   FieldReuse_Cuda input_matrix_reuse[CEED_FIELD_MAX];
1287 
1288   for (CeedInt i = 0; i < num_input_fields; i++) {
1289     input_matrix_reuse[i].index = -1;
1290   }
1291   for (CeedInt i = 0; i < num_input_fields; i++) {
1292     bool         is_tensor = true;
1293     CeedEvalMode eval_mode_i;
1294     CeedBasis    basis_i;
1295 
1296     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
1297     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
1298     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
1299     CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
1300     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
1301       CeedEvalMode eval_mode_j;
1302       CeedBasis    basis_j;
1303 
1304       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1305       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1306       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1307       if (basis_i == basis_j) {
1308         if (is_tensor) {
1309           input_matrix_reuse[i].index     = j;
1310           input_matrix_reuse[i].is_input  = true;
1311           input_matrix_reuse[i].eval_mode = eval_mode_j;
1312         } else {
1313           // For non-tensor can only re-use with the same eval mode
1314           if (eval_mode_i == eval_mode_j) {
1315             input_matrix_reuse[i].index     = j;
1316             input_matrix_reuse[i].is_input  = true;
1317             input_matrix_reuse[i].eval_mode = eval_mode_j;
1318           }
1319         }
1320       }
1321       CeedCallBackend(CeedBasisDestroy(&basis_j));
1322     }
1323     CeedCallBackend(CeedBasisDestroy(&basis_i));
1324   }
1325 
1326   // -- Determine output mat reuse
1327   FieldReuse_Cuda output_matrix_reuse[CEED_FIELD_MAX];
1328 
1329   for (CeedInt i = 0; i < num_output_fields; i++) {
1330     output_matrix_reuse[i].index = -1;
1331   }
1332   for (CeedInt i = 0; i < num_output_fields; i++) {
1333     bool         is_tensor = true;
1334     CeedEvalMode eval_mode_i;
1335     CeedBasis    basis_i;
1336 
1337     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
1338     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
1339     CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
1340     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
1341       CeedEvalMode eval_mode_j;
1342       CeedBasis    basis_j;
1343 
1344       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1345       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1346       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1347       if (basis_i == basis_j) {
1348         if (is_tensor) {
1349           output_matrix_reuse[i].index     = j;
1350           output_matrix_reuse[i].is_input  = true;
1351           output_matrix_reuse[i].eval_mode = eval_mode_j;
1352         } else {
1353           // For non-tensor can only re-use with the same eval mode
1354           if (eval_mode_i == eval_mode_j) {
1355             output_matrix_reuse[i].index     = j;
1356             output_matrix_reuse[i].is_input  = true;
1357             output_matrix_reuse[i].eval_mode = eval_mode_j;
1358           }
1359         }
1360       }
1361       CeedCallBackend(CeedBasisDestroy(&basis_j));
1362     }
1363     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
1364       CeedEvalMode eval_mode_j;
1365       CeedBasis    basis_j;
1366 
1367       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
1368       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1369       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
1370       if (basis_i == basis_j) {
1371         if (is_tensor) {
1372           output_matrix_reuse[i].index     = j;
1373           output_matrix_reuse[i].is_input  = false;
1374           output_matrix_reuse[i].eval_mode = eval_mode_j;
1375         } else {
1376           // For non-tensor can only re-use with the same eval mode
1377           if (eval_mode_i == eval_mode_j) {
1378             output_matrix_reuse[i].index     = j;
1379             output_matrix_reuse[i].is_input  = false;
1380             output_matrix_reuse[i].eval_mode = eval_mode_j;
1381           }
1382         }
1383       }
1384       CeedCallBackend(CeedBasisDestroy(&basis_j));
1385     }
1386     CeedCallBackend(CeedBasisDestroy(&basis_i));
1387   }
1388 
1389   // Initialize constants, and matrices B and G
1390   code << "\n  // Input field constants and basis data\n";
1391   for (CeedInt i = 0; i < num_input_fields; i++) {
1392     CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], Q_1d,
1393                                                               true, is_all_tensor, is_at_points, use_3d_slices));
1394   }
1395   code << "\n  // Output field constants and basis data\n";
1396   for (CeedInt i = 0; i < num_output_fields; i++) {
1397     CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], Q_1d,
1398                                                               false, is_all_tensor, is_at_points, use_3d_slices));
1399   }
1400 
1401   // Loop over all elements
1402   code << "\n  // Element loop\n";
1403   code << "  __syncthreads();\n";
1404   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
1405 
1406   // -- Compute minimum buffer space needed
1407   CeedInt max_rstr_buffer_size = 1;
1408 
1409   for (CeedInt i = 0; i < num_input_fields; i++) {
1410     CeedInt             num_comp, elem_size;
1411     CeedElemRestriction elem_rstr;
1412 
1413     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
1414     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1415     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
1416     max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? elem_size : 1));
1417     CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1418   }
1419   for (CeedInt i = 0; i < num_output_fields; i++) {
1420     CeedInt             num_comp, elem_size;
1421     CeedElemRestriction elem_rstr;
1422 
1423     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
1424     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1425     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
1426     max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? elem_size : 1));
1427     CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1428   }
1429   code << "    // Scratch restriction buffer space\n";
1430   code << "    CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
1431 
1432   // -- Determine best input field processing order
1433   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
1434 
1435   for (CeedInt i = 0; i < num_input_fields; i++) {
1436     field_rstr_in_buffer[i] = -1;
1437     input_field_order[i]    = -1;
1438   }
1439   {
1440     bool    is_ordered[CEED_FIELD_MAX];
1441     CeedInt curr_index = 0;
1442 
1443     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
1444     for (CeedInt i = 0; i < num_input_fields; i++) {
1445       CeedVector          vec_i;
1446       CeedElemRestriction rstr_i;
1447 
1448       if (is_ordered[i]) continue;
1449       field_rstr_in_buffer[i]       = i;
1450       is_ordered[i]                 = true;
1451       input_field_order[curr_index] = i;
1452       curr_index++;
1453       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
1454       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
1455       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
1456       for (CeedInt j = i + 1; j < num_input_fields; j++) {
1457         CeedVector          vec_j;
1458         CeedElemRestriction rstr_j;
1459 
1460         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
1461         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
1462         if (rstr_i == rstr_j && vec_i == vec_j) {
1463           field_rstr_in_buffer[j]       = i;
1464           is_ordered[j]                 = true;
1465           input_field_order[curr_index] = j;
1466           curr_index++;
1467         }
1468         CeedCallBackend(CeedVectorDestroy(&vec_j));
1469         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
1470       }
1471       CeedCallBackend(CeedVectorDestroy(&vec_i));
1472       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
1473     }
1474   }
1475 
1476   // -- Input restriction and basis
1477   code << "\n    // -- Input field restrictions and basis actions\n";
1478   for (CeedInt i = 0; i < num_input_fields; i++) {
1479     const char   *field_name;
1480     const CeedInt f = input_field_order[i];
1481 
1482     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
1483     code << "    // ---- Input field " << f << ": " << field_name << "\n";
1484 
1485     // ---- Restriction
1486     CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, f, max_dim, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
1487                                                                 Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1488 
1489     // ---- Basis action
1490     CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, f, max_dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, is_all_tensor,
1491                                                           is_at_points, use_3d_slices));
1492   }
1493 
1494   // -- Q function
1495   CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, max_dim, max_num_points, num_input_fields, op_input_fields, qf_input_fields,
1496                                                             num_output_fields, op_output_fields, qf_output_fields, qfunction_name, Q_1d,
1497                                                             is_all_tensor, is_at_points, use_3d_slices));
1498 
1499   // -- Output basis and restriction
1500   code << "\n    // -- Output field basis action and restrictions\n";
1501   for (CeedInt i = 0; i < num_output_fields; i++) {
1502     const char *field_name;
1503 
1504     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
1505     code << "    // ---- Output field " << i << ": " << field_name << "\n";
1506 
1507     // ---- Basis action
1508     CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, max_dim, op_output_fields[i], qf_output_fields[i], Q_1d, false,
1509                                                           is_all_tensor, is_at_points, use_3d_slices));
1510 
1511     // ---- Restriction
1512     CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, max_dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false,
1513                                                                 is_all_tensor, is_at_points, use_3d_slices));
1514   }
1515 
1516   // Close loop and function
1517   code << "  }\n";
1518   code << "}\n";
1519   code << "// -----------------------------------------------------------------------------\n\n";
1520 
1521   // Compile
1522   {
1523     bool          is_compile_good = false;
1524     const CeedInt T_1d            = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d);
1525 
1526     CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "OP_T_1D", T_1d));
1527     if (is_compile_good) {
1528       *is_good_build = true;
1529       CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op));
1530     } else {
1531       *is_good_build     = false;
1532       data->use_fallback = true;
1533     }
1534   }
1535   CeedCallBackend(CeedOperatorSetSetupDone(op));
1536   CeedCallBackend(CeedDestroy(&ceed));
1537   CeedCallBackend(CeedQFunctionDestroy(&qf));
1538   return CEED_ERROR_SUCCESS;
1539 }
1540 
1541 //------------------------------------------------------------------------------
1542