xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision b46df0d23d416892813aae9c232a5a88657bbf88)
1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #define CEED_DEBUG_COLOR 12
9 
10 #include <ceed.h>
11 #include <ceed/backend.h>
12 #include <ceed/jit-tools.h>
13 #include <cuda_runtime.h>
14 
15 #include <iostream>
16 #include <sstream>
17 #include <string>
18 
19 #include "../cuda-ref/ceed-cuda-ref.h"
20 #include "../cuda-shared/ceed-cuda-shared.h"
21 #include "../cuda/ceed-cuda-common.h"
22 #include "../cuda/ceed-cuda-compile.h"
23 #include "ceed-cuda-gen.h"
24 
// Record of an earlier operator field whose shared-memory basis matrices
// (s_B/s_G) a later field can alias instead of loading a duplicate copy.
struct FieldReuse_Cuda {
  CeedInt      index;      // index of the earlier field; -1 means no reuse
  bool         is_input;   // true if the earlier field is an input field
  CeedEvalMode eval_mode;  // eval mode of the earlier field
};
30 
31 //------------------------------------------------------------------------------
32 // Determine type of operator
33 //------------------------------------------------------------------------------
34 static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
35                                                 CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields,
36                                                 CeedQFunctionField *qf_output_fields, CeedInt *max_P, CeedInt *max_P_1d, CeedInt *Q, CeedInt *Q_1d,
37                                                 CeedInt *max_dim, bool *is_all_tensor, bool *use_3d_slices) {
38   // Check if all are tensor
39   *is_all_tensor = true;
40   for (CeedInt i = 0; i < num_input_fields; i++) {
41     CeedBasis basis;
42 
43     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
44     if (basis != CEED_BASIS_NONE) {
45       bool is_field_tensor;
46 
47       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
48       *is_all_tensor = *is_all_tensor && is_field_tensor;
49     }
50     CeedCallBackend(CeedBasisDestroy(&basis));
51   }
52   for (CeedInt i = 0; i < num_output_fields; i++) {
53     CeedBasis basis;
54 
55     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
56     if (basis != CEED_BASIS_NONE) {
57       bool is_field_tensor;
58 
59       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
60       *is_all_tensor = *is_all_tensor && is_field_tensor;
61     }
62     CeedCallBackend(CeedBasisDestroy(&basis));
63   }
64 
65   // Find max_P, max_P_1d, Q, and Q_1d
66   bool is_all_3d = true;
67 
68   *max_P    = 0;
69   *max_P_1d = 0;
70   *Q        = 0;
71   *Q_1d     = 0;
72   for (CeedInt i = 0; i < num_input_fields; i++) {
73     CeedBasis basis;
74 
75     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
76     if (basis != CEED_BASIS_NONE) {
77       bool    is_field_tensor;
78       CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
79 
80       // Check if 3D
81       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
82       is_all_3d = is_all_3d && (field_dim == 3);
83       *max_dim  = CeedIntMax(*max_dim, field_dim);
84 
85       // Collect P, P_1d, Q, and Q_1d
86       CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
87       *max_P = CeedIntMax(*max_P, field_P);
88       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
89       if (is_field_tensor) {
90         CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
91         *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
92       }
93       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
94       CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
95       *Q = field_Q;
96       if (is_field_tensor) {
97         CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
98         CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
99         *Q_1d = field_Q_1d;
100       }
101     }
102     CeedCallBackend(CeedBasisDestroy(&basis));
103   }
104   for (CeedInt i = 0; i < num_output_fields; i++) {
105     CeedBasis basis;
106 
107     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
108     if (basis != CEED_BASIS_NONE) {
109       bool    is_field_tensor;
110       CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
111 
112       // Check if 3D
113       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
114       is_all_3d = is_all_3d && (field_dim == 3);
115       *max_dim  = CeedIntMax(*max_dim, field_dim);
116 
117       // Collect P, P_1d, Q, and Q_1d
118       CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
119       *max_P = CeedIntMax(*max_P, field_P);
120       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
121       if (is_field_tensor) {
122         CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
123         *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
124       }
125       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
126       CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
127       *Q = field_Q;
128       if (is_field_tensor) {
129         CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
130         CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
131         *Q_1d = field_Q_1d;
132       }
133     }
134     CeedCallBackend(CeedBasisDestroy(&basis));
135   }
136 
137   // Only use 3D collocated gradient parallelization strategy when gradient is computed
138   *use_3d_slices = false;
139   if (is_all_3d && *is_all_tensor) {
140     bool was_grad_found = false;
141 
142     for (CeedInt i = 0; i < num_input_fields; i++) {
143       CeedEvalMode eval_mode;
144 
145       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
146       if (eval_mode == CEED_EVAL_GRAD) {
147         CeedBasis_Cuda_shared *basis_data;
148         CeedBasis              basis;
149 
150         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
151         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
152         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
153         was_grad_found = true;
154         CeedCallBackend(CeedBasisDestroy(&basis));
155       }
156     }
157     for (CeedInt i = 0; i < num_output_fields; i++) {
158       CeedEvalMode eval_mode;
159 
160       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
161       if (eval_mode == CEED_EVAL_GRAD) {
162         CeedBasis_Cuda_shared *basis_data;
163         CeedBasis              basis;
164 
165         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
166         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
167         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
168         was_grad_found = true;
169         CeedCallBackend(CeedBasisDestroy(&basis));
170       }
171     }
172   }
173   return CEED_ERROR_SUCCESS;
174 }
175 
//------------------------------------------------------------------------------
// Setup fields
//------------------------------------------------------------------------------

// Lazily build the 1D Chebyshev interpolation matrix for a basis used AtPoints
// and upload it to the device; no-op when the device copy already exists.
static int CeedOperatorEnsureChebyshevInterp_Cuda_gen(CeedBasis basis, CeedBasis_Cuda_shared *basis_data, CeedInt P_1d, CeedInt Q_1d) {
  if (!basis_data->d_chebyshev_interp_1d) {
    CeedSize    interp_bytes;
    CeedScalar *chebyshev_interp_1d;

    interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
    CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
    CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
    CeedCallCuda(CeedBasisReturnCeed(basis), cudaMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
    CeedCallCuda(CeedBasisReturnCeed(basis),
                 cudaMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
    CeedCallBackend(CeedFree(&chebyshev_interp_1d));
  }
  return CEED_ERROR_SUCCESS;
}

// Emit the per-field constants (dim, P, num_comp) and shared-memory basis
// matrix loads (s_B/s_G) for one operator field into `code`, and record the
// corresponding device basis pointers in data->B / data->G. When
// field_reuse.index != -1 the generated code aliases the earlier field's
// shared-memory matrices instead of loading duplicates.
static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedOperatorField op_field,
                                                     CeedQFunctionField qf_field, FieldReuse_Cuda field_reuse, CeedInt max_dim, CeedInt Q,
                                                     CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points, bool use_3d_slices) {
  bool      is_tensor = true;
  CeedBasis basis;
  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
  if (basis != CEED_BASIS_NONE) CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));

  const char            *field_name;
  std::string            var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
  std::string            P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
  std::string            option_name = (is_input ? "inputs" : "outputs");
  CeedEvalMode           eval_mode   = CEED_EVAL_NONE;
  CeedInt                elem_size = 0, num_comp = 0, dim = max_dim, P_1d = 0;
  CeedElemRestriction    elem_rstr;
  CeedBasis_Cuda_shared *basis_data;  // only set below when basis != CEED_BASIS_NONE

  // Field reuse info
  bool use_previous_field = field_reuse.index != -1;

  CeedCallBackend(CeedOperatorFieldGetName(op_field, &field_name));
  code << "  // -- " << (is_input ? "Input" : "Output") << " field " << i << ": " << field_name << "\n";

  // Get field data
  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
  }
  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
  if (basis != CEED_BASIS_NONE) {
    CeedCallBackend(CeedBasisGetData(basis, &basis_data));
    CeedCallBackend(CeedBasisGetDimension(basis, &dim));
    if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
    else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
  }
  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));

  // Set field constants
  code << "  const CeedInt dim" << var_suffix << " = " << dim << ";\n";
  if (is_tensor && !is_all_tensor) {
    CeedInt P = 0;

    // NOTE(review): this call is also reached when basis == CEED_BASIS_NONE
    // (is_tensor stays true then); the ternary below guards only the emitted
    // value — confirm the getter is benign for CEED_BASIS_NONE
    CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
    code << "  const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q : P) << ";\n";
  }
  code << "  const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
  if (eval_mode != CEED_EVAL_WEIGHT) {
    code << "  const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
  }

  // Load basis data
  code << "  // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
  switch (eval_mode) {
    case CEED_EVAL_NONE:
      break;  // No basis matrices needed
    case CEED_EVAL_INTERP:
      if (is_at_points) {
        // AtPoints — interpolation goes through the Chebyshev basis
        CeedCallBackend(CeedOperatorEnsureChebyshevInterp_Cuda_gen(basis, basis_data, P_1d, Q_1d));
        if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
        else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
      } else {
        // Standard quadrature
        if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
        else data->B.outputs[i] = basis_data->d_interp_1d;
      }
      if (use_previous_field) {
        // Alias the earlier field's shared-memory interpolation matrix
        std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

        code << "  CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
      } else {
        code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
        code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
      }
      break;
    case CEED_EVAL_GRAD:
      if (is_at_points) {
        // AtPoints — gradient also goes through the Chebyshev basis
        CeedCallBackend(CeedOperatorEnsureChebyshevInterp_Cuda_gen(basis, basis_data, P_1d, Q_1d));
        if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
        else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
      } else {
        // Standard quadrature
        if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
        else data->B.outputs[i] = basis_data->d_interp_1d;
      }
      if (is_tensor) {
        if (use_previous_field) {
          // Alias the earlier field's shared-memory interpolation matrix
          std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

          code << "  CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
        } else {
          code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
          code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
        }
      }
      if (is_at_points) break;  // No G mat for AtPoints
      if (use_3d_slices) {
        // Collocated gradient at quadrature points (Q_1d x Q_1d)
        if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
        else data->G.outputs[i] = basis_data->d_collo_grad_1d;
        if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
          std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

          code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
        } else {
          code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
          code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
        }
      } else {
        // Prefer the collocated gradient when the basis provides one
        bool has_collo_grad = basis_data->d_collo_grad_1d;

        if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
        else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
        if (has_collo_grad) {
          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
            std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

            code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
          } else {
            code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
            code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
          }
        } else {
          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
            std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

            code << "  CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
          } else {
            // Non-tensor bases carry the extra dim factor in the G matrix size
            code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim")
                 << (is_tensor ? "" : var_suffix) << "];\n";
            code << "  LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << (is_tensor ? "" : var_suffix) << ">(data, G."
                 << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
          }
        }
      }
      break;
    case CEED_EVAL_WEIGHT:
      break;  // No action
      // LCOV_EXCL_START
    case CEED_EVAL_DIV:
    case CEED_EVAL_CURL:
      break;  // TODO: Not implemented
              // LCOV_EXCL_STOP
  }
  CeedCallBackend(CeedBasisDestroy(&basis));
  return CEED_ERROR_SUCCESS;
}
349 
//------------------------------------------------------------------------------
// Restriction
//------------------------------------------------------------------------------
// Emit the element-restriction stage for one operator field: reads from the
// L-vector into r_e_* for inputs, or writes r_e_* back to the L-vector for
// outputs, and records the device offset arrays in data->indices.
static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt field_input_buffer[],
                                                       CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d,
                                                       bool is_input, bool is_all_tensor, bool is_at_points, bool use_3d_slices) {
  std::string               var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
  std::string               P_name     = (is_all_tensor ? "P_1d" : "P") + var_suffix;
  CeedEvalMode              eval_mode  = CEED_EVAL_NONE;
  CeedInt                   elem_size = 0, num_comp = 0;
  CeedSize                  l_size;
  CeedRestrictionType       rstr_type = CEED_RESTRICTION_STANDARD;
  // NOTE(review): rstr_data is only assigned when a restriction is present;
  // later d_offsets reads assume elem_rstr != CEED_ELEMRESTRICTION_NONE
  // (CEED_EVAL_WEIGHT inputs, which lack a restriction, are filtered below) — confirm
  CeedElemRestriction_Cuda *rstr_data;
  CeedElemRestriction       elem_rstr;

  // Get field data
  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
    CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
    CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
  }
  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));

  // Restriction
  if (is_input) {
    // Input
    if (field_input_buffer[i] != i) {
      // Another input field already produced the identical E-vector; alias it
      std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);

      // Restriction was already done for previous input
      code << "    CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
    } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) {
      if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) {
        // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated
        code << "    CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
      } else if (rstr_type != CEED_RESTRICTION_POINTS) {
        // Otherwise we're using the scratch space
        code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
      }
      switch (rstr_type) {
        case CEED_RESTRICTION_STANDARD: {
          // Indexed gather via the d_offsets array
          CeedInt comp_stride;

          CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
          code << "    const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
          code << "    // CompStride: " << comp_stride << "\n";
          data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
          code << "    ReadLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name
               << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix << ");\n";
          break;
        }
        case CEED_RESTRICTION_STRIDED: {
          bool    has_backend_strides;
          CeedInt num_elem;

          CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
          CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
          // Default strides used when the backend chooses the data layout
          CeedInt strides[3] = {1, elem_size * num_elem, elem_size};

          if (!has_backend_strides) {
            CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
          }
          code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
          code << "    ReadLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", " << strides[0] << ", "
               << strides[1] << ", " << strides[2] << ">(data, elem, d" << var_suffix << ", r_e" << var_suffix << ");\n";
          break;
        }
        case CEED_RESTRICTION_POINTS: {
          // AtPoints: only record the comp stride and offsets; no read emitted here
          CeedInt comp_stride;

          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
          code << "    const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
          data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
          break;
        }
        // LCOV_EXCL_START
        case CEED_RESTRICTION_ORIENTED:
        case CEED_RESTRICTION_CURL_ORIENTED:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
  } else {
    // Output — scatter r_e_* back to the L-vector
    switch (rstr_type) {
      case CEED_RESTRICTION_STANDARD: {
        CeedInt comp_stride;

        CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
        code << "    const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
        CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
        code << "    // CompStride: " << comp_stride << "\n";
        data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
        code << "    WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name
             << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix << ");\n";
        break;
      }
      case CEED_RESTRICTION_STRIDED: {
        bool    has_backend_strides;
        CeedInt num_elem;

        CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
        CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
        // Default strides used when the backend chooses the data layout
        CeedInt strides[3] = {1, elem_size * num_elem, elem_size};

        if (!has_backend_strides) {
          CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
        }
        code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
        code << "    WriteLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", " << strides[0] << ", "
             << strides[1] << ", " << strides[2] << ">(data, elem, r_e" << var_suffix << ", d" << var_suffix << ");\n";
        break;
      }
      case CEED_RESTRICTION_POINTS:
        // AtPoints: only record the offsets; no write emitted here
        data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
        break;
      // LCOV_EXCL_START
      case CEED_RESTRICTION_ORIENTED:
      case CEED_RESTRICTION_CURL_ORIENTED:
        break;  // TODO: Not implemented
                // LCOV_EXCL_STOP
    }
  }
  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
  return CEED_ERROR_SUCCESS;
}
479 
480 //------------------------------------------------------------------------------
481 // Basis
482 //------------------------------------------------------------------------------
483 static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedOperatorField op_field,
484                                                  CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor,
485                                                  bool is_at_points, bool use_3d_slices) {
486   bool      is_tensor = true;
487   CeedBasis basis;
488   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
489   CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
490 
491   std::string         var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
492   std::string         P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
493   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
494   CeedInt             dim = max_dim, elem_size = 0, num_comp = 0, P_1d = 0;
495   CeedElemRestriction elem_rstr;
496 
497   // Get field data
498   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
499   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
500     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
501     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
502   }
503   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
504   if (basis != CEED_BASIS_NONE) {
505     CeedCallBackend(CeedBasisGetDimension(basis, &dim));
506     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
507     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
508   }
509   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
510 
511   // Basis
512   code << "    // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
513   if (is_input) {
514     switch (eval_mode) {
515       case CEED_EVAL_NONE:
516         if (!use_3d_slices && !is_at_points) {
517           code << "    CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n";
518         }
519         break;
520       case CEED_EVAL_INTERP:
521         if (is_at_points) {
522           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
523 
524           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
525           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
526                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
527         } else {
528           std::string function_name = is_tensor
529                                           ? ((dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
530                                           : "InterpNonTensor";
531           std::string op_t_1d_name  = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
532 
533           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
534           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
535                << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
536         }
537         break;
538       case CEED_EVAL_GRAD:
539         if (is_at_points) {
540           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
541 
542           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
543           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
544                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
545         } else if (use_3d_slices) {
546           std::string function_name = (dim > 1 ? "InterpTensor" : "Interp") + std::to_string(dim) + "d";
547 
548           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
549           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
550                << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
551         } else if (is_tensor) {
552           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
553           std::string function_name = (dim == 1 ? "Grad" : (is_collocated ? "GradTensorCollocated" : "GradTensor")) + std::to_string(dim) + "d" +
554                                       (is_all_tensor ? "" : "Flattened");
555           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
556 
557           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
558                << (is_all_tensor && dim >= 3 ? Q_name : "1") << "];\n";
559           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
560                << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
561         } else {
562           std::string function_name = "GradNonTensor";
563 
564           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
565           code << "    " << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
566                << ", OP_T_1D>(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
567         }
568         break;
569       case CEED_EVAL_WEIGHT: {
570         if (is_at_points) {
571           code << "    // Nothing to do AtPoints\n";
572         } else {
573           CeedBasis_Cuda_shared *basis_data;
574           std::string            function_name = is_tensor
575                                                      ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
576                                                      : "WeightNonTensor";
577 
578           code << "    CeedScalar r_q" << var_suffix << "[" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
579           CeedCallBackend(CeedBasisGetData(basis, &basis_data));
580           data->W = basis_data->d_q_weight_1d;
581           code << "    " << function_name << "<" << P_name << ", " << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
582         }
583         break;
584       }
585       // LCOV_EXCL_START
586       case CEED_EVAL_DIV:
587       case CEED_EVAL_CURL:
588         break;  // TODO: Not implemented
589                 // LCOV_EXCL_STOP
590     }
591   } else {
592     switch (eval_mode) {
593       case CEED_EVAL_NONE:
594         code << "    CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
595         break;  // No action
596       case CEED_EVAL_INTERP:
597         code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
598         if (is_at_points) {
599           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
600 
601           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
602                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
603         } else {
604           std::string function_name =
605               is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
606                         : "InterpTransposeNonTensor";
607           std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
608 
609           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
610                << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
611         }
612         break;
613       case CEED_EVAL_GRAD:
614         code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
615         if (is_at_points) {
616           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
617 
618           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
619                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
620         } else if (use_3d_slices) {
621           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
622 
623           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
624                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
625         } else if (is_tensor) {
626           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
627           std::string function_name = (dim == 1 ? "GradTranspose" : (is_collocated ? "GradTransposeTensorCollocated" : "GradTransposeTensor")) +
628                                       std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
629           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
630 
631           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
632                << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
633         } else {
634           std::string function_name = "GradTransposeNonTensor";
635 
636           code << "    " << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
637                << ", OP_T_1D>(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
638         }
639         break;
640       // LCOV_EXCL_START
641       case CEED_EVAL_WEIGHT:
642         break;  // Should not occur
643       case CEED_EVAL_DIV:
644       case CEED_EVAL_CURL:
645         break;  // TODO: Not implemented
646                 // LCOV_EXCL_STOP
647     }
648   }
649   CeedCallBackend(CeedBasisDestroy(&basis));
650   return CEED_ERROR_SUCCESS;
651 }
652 
//------------------------------------------------------------------------------
// QFunction
//
// Emits the kernel section that gathers QFunction inputs into registers,
// applies the user QFunction, and writes its outputs back out, appending the
// generated CUDA source to `code`. Three mutually exclusive code paths are
// generated:
//   - is_at_points:  loop over batches of points; INTERP/GRAD outputs are
//                    accumulated into coefficient registers (r_c_*)
//   - use_3d_slices: loop over quadrature-point planes of 3D tensor elements
//                    to reduce register pressure
//   - otherwise:     a single scope operating on full elements
// In every path the QFunction reads/writes through r_s_* registers, which are
// collected into the generated `inputs`/`outputs` pointer arrays.
//
// Side effect: for non-strided restrictions in the 3D-slices path, the device
// offsets pointer is recorded in data->indices.inputs[i] so the generated
// kernel can reference it as indices.inputs[i].
//------------------------------------------------------------------------------
static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt max_dim, CeedInt max_num_points,
                                                     CeedInt num_input_fields, CeedOperatorField *op_input_fields,
                                                     CeedQFunctionField *qf_input_fields, CeedInt num_output_fields,
                                                     CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields,
                                                     std::string qfunction_name, CeedInt Q_1d, bool is_all_tensor, bool is_at_points,
                                                     bool use_3d_slices) {
  // "Q_1d" is the per-dimension quadrature size for all-tensor operators; "Q" otherwise
  std::string         Q_name    = is_all_tensor ? "Q_1d" : "Q";
  CeedEvalMode        eval_mode = CEED_EVAL_NONE;
  CeedElemRestriction elem_rstr;  // only used (and destroyed) in the 3D-slices input path below

  // Setup output arrays
  // Declare the quadrature-space (r_q_*) or accumulator (r_c_*) registers for
  // each output field before entering the point/slice loop.
  code << "\n    // -- Output field setup\n";
  for (CeedInt i = 0; i < num_output_fields; i++) {
    const char *field_name;
    std::string var_suffix = "_out_" + std::to_string(i);

    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
    code << "    // ---- Output field " << i << ": " << field_name << "\n";
    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
    switch (eval_mode) {
      case CEED_EVAL_NONE:
        if (is_at_points) {
          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n";
        } else {
          // 3D all-tensor elements keep a full 1D pencil per thread; otherwise one value per component
          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
               << "];\n";
        }
        break;
      case CEED_EVAL_INTERP:
        if (is_at_points) {
          // Accumulator for point data
          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
               << "[i] = 0.0;\n";
        } else {
          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
               << "];\n";
        }
        break;
      case CEED_EVAL_GRAD:
        if (is_at_points) {
          // Accumulator for point data
          code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
               << "[i] = 0.0;\n";
        } else if (use_3d_slices) {
          // Accumulator for gradient slices
          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
          code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) r_q" << var_suffix << "[i] = 0.0;\n";
        } else {
          // Full gradient storage: one entry per component per dimension (per pencil point in 3D tensor case)
          code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
               << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") << "];\n";
        }
        break;
      case CEED_EVAL_WEIGHT:
        break;
        // LCOV_EXCL_START
      case CEED_EVAL_DIV:
      case CEED_EVAL_CURL:
        break;  // TODO: Not implemented
                // LCOV_EXCL_STOP
    }
  }

  if (is_at_points) {
    // We need to handle batches of points
    // The loop bound is rounded up to a whole number of thread-block passes so
    // every thread makes the same number of trips — presumably so the
    // AtPoints basis helpers see a full block each iteration (confirm against
    // the templates in cuda-shared-basis-tensor-at-points-templates.h).
    code << "\n    // Note: Using batches of points\n";
    code << "    const CeedInt point_loop_bound = (blockDim.x*blockDim.y) * ceil((1.0*max_num_points) / (blockDim.x*blockDim.y));\n\n";
    code << "    #pragma unroll\n";
    code << "    for (CeedInt i = threadIdx.x + threadIdx.y*blockDim.x; i < point_loop_bound; i += blockDim.x*blockDim.y) {\n";
    code << "      const CeedInt p = i % max_num_points;\n\n";

    code << "      // -- Coordinates\n";
    code << "      CeedScalar r_x[max_dim];\n";
    code << "      ReadPoint<max_dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n";

    code << "      // -- Input fields\n";
    for (CeedInt i = 0; i < num_input_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_in_" + std::to_string(i);
      std::string P_name     = "P_1d" + var_suffix;

      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
      code << "      // ---- Input field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
      // Basis action
      // NONE reads values directly at the point; INTERP/GRAD evaluate the
      // coefficient registers (r_c_*) at the point coordinates r_x.
      code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          code << "      ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
               << ", max_num_points>(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
          break;
        case CEED_EVAL_INTERP:
          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          code << "      InterpAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
               << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
          break;
        case CEED_EVAL_GRAD:
          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          code << "      GradAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
               << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
          break;
        case CEED_EVAL_WEIGHT:
          // AtPoints weights are identically 1.0
          code << "      CeedScalar r_s" << var_suffix << "[1];\n";
          code << "      r_s" << var_suffix << "[0] = 1.0;\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
    // Output r_s_* registers are only declared here; they are written by the
    // QFunction call and mapped back to coefficients after it.
    code << "\n      // -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_out_" + std::to_string(i);

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << "      // ---- Output field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      // Basis action
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          break;
        case CEED_EVAL_INTERP:
          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          break;
        case CEED_EVAL_GRAD:
          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_WEIGHT:
          break;  // Should not occur
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }

  } else if (use_3d_slices) {
    // We treat quadrature points per slice in 3d to save registers
    code << "\n    // Note: Using planes of 3D elements\n";
    code << "    #pragma unroll\n";
    code << "    for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
    code << "      // -- Input fields\n";
    for (CeedInt i = 0; i < num_input_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_in_" + std::to_string(i);

      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
      code << "      // ---- Input field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
      // Basis action
      code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          // NONE inputs are read straight from the CeedVector slice-by-slice:
          // strided restrictions bake the strides into template parameters,
          // non-strided ones go through device offset indices.
          bool is_strided;

          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";

          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
          CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
          if (is_strided) {
            bool    has_backend_strides;
            CeedInt num_elem, elem_size;

            CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
            CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
            CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
            // Default backend layout {node, comp, elem}; overwritten below when
            // the restriction carries user-supplied strides
            CeedInt strides[3] = {1, elem_size * num_elem, elem_size};

            if (!has_backend_strides) {
              CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
            }
            code << "      // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
            code << "      ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", " << strides[0] << ", " << strides[1] << ", "
                 << strides[2] << ">(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n";
          } else {
            CeedSize                  l_size = 0;
            CeedInt                   comp_stride;
            CeedElemRestriction_Cuda *rstr_data;

            CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
            code << "      const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
            CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
            code << "      // CompStride: " << comp_stride << "\n";
            CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
            // Record the device offsets pointer on the operator data so the
            // generated kernel can access it as indices.inputs[i]
            data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
            code << "      ReadEVecSliceStandard3d<num_comp" << var_suffix << ", " << comp_stride << ", " << Q_name << ">(data, l_size" << var_suffix
                 << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
          }
          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
          break;
        case CEED_EVAL_INTERP:
          // Copy this slice's values out of the quadrature-space pencil
          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n";
          code << "        r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n";
          code << "      }\n";
          break;
        case CEED_EVAL_GRAD:
          // Collocated gradient evaluated on the current slice only
          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          code << "      GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_q" << var_suffix << ", s_G"
               << var_suffix << ", r_s" << var_suffix << ");\n";
          break;
        case CEED_EVAL_WEIGHT:
          code << "      CeedScalar r_s" << var_suffix << "[1];\n";
          code << "      r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
    // Output r_s_* registers for this slice; filled by the QFunction call and
    // copied back into the r_q_* pencils after it
    code << "\n      // -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_out_" + std::to_string(i);

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << "      // ---- Output field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      // Basis action
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          break;
        case CEED_EVAL_INTERP:
          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          break;
        case CEED_EVAL_GRAD:
          code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_WEIGHT:
          break;  // Should not occur
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
  } else {
    // Full-element path: no extra basis action is needed here, so the slice
    // registers simply alias the quadrature-space registers
    code << "\n    // Note: Using full elements\n";
    code << "    {\n";
    code << "      // -- Input fields\n";
    for (CeedInt i = 0; i < num_input_fields; i++) {
      const char *field_name;

      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
      code << "      // ---- Input field " << i << ": " << field_name << "\n";
      code << "      CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n";
    }
    code << "      // -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << "      // ---- Output field " << i << ": " << field_name << "\n";
      code << "      CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n";
    }
  }

  // Input and output buffers
  // Collect the per-field r_s_* registers into the pointer arrays the user
  // QFunction receives (arrays sized at least 1 to stay valid C++)
  code << "\n      // -- QFunction inputs and outputs\n";
  code << "      // ---- Inputs\n";
  code << "      CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
  for (CeedInt i = 0; i < num_input_fields; i++) {
    const char *field_name;

    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
    code << "      // ------ Input field " << i << ": " << field_name << "\n";
    code << "      inputs[" << i << "] = r_s_in_" << i << ";\n";
  }
  code << "      // ---- Outputs\n";
  code << "      CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
  for (CeedInt i = 0; i < num_output_fields; i++) {
    const char *field_name;

    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
    code << "      // ------ Output field " << i << ": " << field_name << "\n";
    code << "      outputs[" << i << "] = r_s_out_" << i << ";\n";
  }

  // Apply QFunction
  // The Q argument is 1 except for full 3D all-tensor elements, where each
  // thread carries a Q_1d pencil; this condition must stay in sync with the
  // CEED_Q_VLA definition emitted in CeedOperatorBuildKernel_Cuda_gen
  code << "\n      // -- Apply QFunction\n";
  code << "      " << qfunction_name << "(ctx, ";
  if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
    code << "1";
  } else {
    code << Q_name;
  }
  code << ", inputs, outputs);\n";

  if (is_at_points) {
    // Map back to coefficients
    code << "\n      // -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_out_" + std::to_string(i);
      std::string P_name     = "P_1d" + var_suffix;

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << "      // ---- Output field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      // Basis action
      code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
      switch (eval_mode) {
        case CEED_EVAL_NONE: {
          CeedInt             comp_stride;
          CeedElemRestriction elem_rstr;  // local; shadows the function-level elem_rstr

          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
          code << "      const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
          code << "      WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
               << ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]"
               << ", r_s" << var_suffix << ", d" << var_suffix << ");\n";
          break;
        }
        case CEED_EVAL_INTERP:
          // Zero padding-point values so they do not pollute the transpose sum
          code << "      if (i >= points.num_per_elem[elem]) {\n";
          code << "        for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
          code << "      }\n";
          code << "      InterpTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
               << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
          break;
        case CEED_EVAL_GRAD:
          // Zero padding-point values so they do not pollute the transpose sum
          code << "      if (i >= points.num_per_elem[elem]) {\n";
          code << "        for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
          code << "      }\n";
          code << "      GradTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
               << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_WEIGHT:
          break;  // Should not occur
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
  } else if (use_3d_slices) {
    // Copy or apply transpose grad, if needed
    // Write this slice's QFunction outputs back into the r_q_* pencils
    code << "\n      // -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_out_" + std::to_string(i);
      std::string P_name     = "P_1d" + var_suffix;

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << "      // ---- Output field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      // Basis action
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
          code << "        r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
          code << "      }\n";
          break;
        case CEED_EVAL_INTERP:
          code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
          code << "        r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
          code << "      }\n";
          break;
        case CEED_EVAL_GRAD:
          // Transpose of the collocated slice gradient, accumulating into r_q_*
          code << "      GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_s" << var_suffix << ", s_G"
               << var_suffix << ", r_q" << var_suffix << ");\n";
          break;
          // LCOV_EXCL_START
        case CEED_EVAL_WEIGHT:
          break;  // Should not occur
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
                  // LCOV_EXCL_STOP
      }
    }
  }
  // Close the generated point/slice loop (or the full-element scope)
  code << "    }\n";
  return CEED_ERROR_SUCCESS;
}
1046 
1047 //------------------------------------------------------------------------------
1048 // Build single operator kernel
1049 //------------------------------------------------------------------------------
1050 extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_build) {
1051   bool                    is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
1052   Ceed                    ceed;
1053   CeedInt                 Q = 0, Q_1d = 0, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
1054   CeedQFunctionField     *qf_input_fields, *qf_output_fields;
1055   CeedQFunction_Cuda_gen *qf_data;
1056   CeedQFunction           qf;
1057   CeedOperatorField      *op_input_fields, *op_output_fields;
1058   CeedOperator_Cuda_gen  *data;
1059   std::ostringstream      code;
1060 
1061   CeedCallBackend(CeedOperatorGetData(op, &data));
1062   {
1063     bool is_setup_done;
1064 
1065     CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
1066     if (is_setup_done) {
1067       *is_good_build = !data->use_fallback;
1068       return CEED_ERROR_SUCCESS;
1069     }
1070   }
1071 
1072   // Check field compatibility
1073   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
1074   {
1075     bool has_shared_bases = true;
1076 
1077     for (CeedInt i = 0; i < num_input_fields; i++) {
1078       CeedBasis basis;
1079 
1080       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
1081       if (basis != CEED_BASIS_NONE) {
1082         bool        is_tensor = true;
1083         const char *resource;
1084         char       *resource_root;
1085         Ceed        basis_ceed;
1086 
1087         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
1088         is_all_tensor    = is_all_tensor && is_tensor;
1089         is_all_nontensor = is_all_nontensor && !is_tensor;
1090         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
1091         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
1092         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
1093         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared");
1094         CeedCallBackend(CeedFree(&resource_root));
1095         CeedCallBackend(CeedDestroy(&basis_ceed));
1096       }
1097       CeedCallBackend(CeedBasisDestroy(&basis));
1098     }
1099 
1100     for (CeedInt i = 0; i < num_output_fields; i++) {
1101       CeedBasis basis;
1102 
1103       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
1104       if (basis != CEED_BASIS_NONE) {
1105         bool        is_tensor = true;
1106         const char *resource;
1107         char       *resource_root;
1108         Ceed        basis_ceed;
1109 
1110         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
1111         is_all_tensor    = is_all_tensor && is_tensor;
1112         is_all_nontensor = is_all_nontensor && !is_tensor;
1113 
1114         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
1115         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
1116         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
1117         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared");
1118         CeedCallBackend(CeedFree(&resource_root));
1119         CeedCallBackend(CeedDestroy(&basis_ceed));
1120       }
1121       CeedCallBackend(CeedBasisDestroy(&basis));
1122     }
1123     // -- Fallback to ref if not all bases are shared
1124     if (!has_shared_bases) {
1125       *is_good_build = false;
1126       return CEED_ERROR_SUCCESS;
1127     }
1128   }
1129   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1130   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
1131   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
1132   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
1133 
1134   // Get operator data
1135   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
1136   {
1137     CeedInt max_P = 0, max_P_1d = 0;
1138 
1139     CeedCallBackend(CeedOperatorBuildKernelData_Cuda_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields,
1140                                                          op_output_fields, qf_output_fields, &max_P, &max_P_1d, &Q, &Q_1d, &max_dim, &is_all_tensor,
1141                                                          &use_3d_slices));
1142     data->max_P_1d = is_all_tensor ? max_P_1d : max_P;
1143   }
1144   if (max_dim == 0) max_dim = 1;
1145   data->dim = max_dim;
1146   if (is_at_points) {
1147     CeedElemRestriction_Cuda *rstr_data;
1148     CeedElemRestriction       rstr_points = NULL;
1149 
1150     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
1151     CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
1152     CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
1153     CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data));
1154     data->points.indices = (CeedInt *)rstr_data->d_offsets;
1155     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
1156   }
1157   if (is_at_points) use_3d_slices = false;
1158   if (Q_1d == 0) {
1159     if (is_at_points) Q_1d = max_num_points;
1160     else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d));
1161   }
1162   if (Q == 0) Q = Q_1d;
1163   data->Q    = Q;
1164   data->Q_1d = Q_1d;
1165 
1166   // Check for restriction only identity operator
1167   {
1168     bool is_identity_qf;
1169 
1170     CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf));
1171     if (is_identity_qf) {
1172       CeedEvalMode eval_mode_in, eval_mode_out;
1173 
1174       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in));
1175       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out));
1176       CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND,
1177                 "Backend does not implement restriction only identity operators");
1178     }
1179   }
1180 
1181   // Add atomicAdd function for old NVidia architectures
1182   {
1183     Ceed_Cuda            *ceed_data;
1184     struct cudaDeviceProp prop;
1185 
1186     CeedCallBackend(CeedGetData(ceed, &ceed_data));
1187     CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id));
1188     if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) {
1189       code << "// AtomicAdd fallback source\n";
1190       code << "#include <ceed/jit-source/cuda/cuda-atomic-add-fallback.h>\n\n";
1191     }
1192   }
1193 
1194   // Load basis source files
1195   if (!is_all_nontensor) {
1196     code << "// Tensor basis source\n";
1197     code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n";
1198   }
1199   if (!is_all_tensor) {
1200     code << "// Non-tensor basis source\n";
1201     code << "#include <ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h>\n\n";
1202   }
1203   if (!is_all_tensor && !is_all_nontensor) {
1204     code << "// Tensor basis source\n";
1205     code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h>\n\n";
1206   }
1207   if (is_at_points) {
1208     code << "// AtPoints basis source\n";
1209     code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h>\n\n";
1210   }
1211   code << "// CodeGen operator source\n";
1212   code << "#include <ceed/jit-source/cuda/cuda-gen-templates.h>\n\n";
1213 
1214   // Get QFunction name
1215   std::string qfunction_name(qf_data->qfunction_name);
1216   std::string operator_name;
1217 
1218   operator_name = "CeedKernelCudaGenOperator_" + qfunction_name;
1219 
1220   // Define CEED_Q_VLA
1221   code << "\n#undef CEED_Q_VLA\n";
1222   if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
1223     code << "#define CEED_Q_VLA 1\n\n";
1224   } else {
1225     code << "#define CEED_Q_VLA " << Q_1d << "\n\n";
1226   }
1227 
1228   // Add user QFunction source
1229   {
1230     const char *source_path;
1231 
1232     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
1233     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/cuda/gen backend requires QFunction source code file");
1234 
1235     code << "// User QFunction source\n";
1236     code << "#include \"" << source_path << "\"\n\n";
1237   }
1238 
1239   // Setup
1240   code << "\n// -----------------------------------------------------------------------------\n";
1241   code << "// Operator Kernel\n";
1242   code << "// \n";
1243   code << "// d_[in,out]_i:   CeedVector device array\n";
1244   code << "// r_[in,out]_e_i: Element vector register\n";
1245   code << "// r_[in,out]_q_i: Quadrature space vector register\n";
1246   code << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
1247   code << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
1248   code << "// \n";
1249   code << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
1250   code << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
1251   code << "// -----------------------------------------------------------------------------\n";
1252   code << "extern \"C\" __global__ void " << operator_name
1253        << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda "
1254           "points) {\n";
1255 
1256   // Scratch buffers
1257   for (CeedInt i = 0; i < num_input_fields; i++) {
1258     CeedEvalMode eval_mode;
1259 
1260     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1261     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
1262       code << "  const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
1263     }
1264   }
1265   for (CeedInt i = 0; i < num_output_fields; i++) {
1266     code << "  CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
1267   }
1268 
1269   code << "  const CeedInt max_dim = " << max_dim << ";\n";
1270   if (!is_all_tensor) {
1271     code << "  const CeedInt Q = " << Q << ";\n";
1272   }
1273   if (!is_all_nontensor) {
1274     code << "  const CeedInt Q_1d = " << Q_1d << ";\n";
1275   }
1276   if (is_at_points) {
1277     code << "  const CeedInt max_num_points = " << max_num_points << ";\n";
1278     code << "  const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
1279   }
1280 
1281   // Shared data
1282   code << "  extern __shared__ CeedScalar slice[];\n";
1283   code << "  SharedData_Cuda data;\n";
1284   code << "  data.t_id_x = threadIdx.x;\n";
1285   code << "  data.t_id_y = threadIdx.y;\n";
1286   code << "  data.t_id_z = threadIdx.z;\n";
1287   code << "  data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
1288   code << "  data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
1289 
1290   // -- Determine input mat reuse
1291   FieldReuse_Cuda input_matrix_reuse[CEED_FIELD_MAX];
1292 
1293   for (CeedInt i = 0; i < num_input_fields; i++) {
1294     input_matrix_reuse[i].index = -1;
1295   }
1296   for (CeedInt i = 0; i < num_input_fields; i++) {
1297     bool         is_tensor = true;
1298     CeedEvalMode eval_mode_i;
1299     CeedBasis    basis_i;
1300 
1301     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
1302     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
1303     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
1304     CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
1305     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
1306       CeedEvalMode eval_mode_j;
1307       CeedBasis    basis_j;
1308 
1309       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1310       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1311       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1312       if (basis_i == basis_j) {
1313         if (is_tensor) {
1314           input_matrix_reuse[i].index     = j;
1315           input_matrix_reuse[i].is_input  = true;
1316           input_matrix_reuse[i].eval_mode = eval_mode_j;
1317         } else {
1318           // For non-tensor can only re-use with the same eval mode
1319           if (eval_mode_i == eval_mode_j) {
1320             input_matrix_reuse[i].index     = j;
1321             input_matrix_reuse[i].is_input  = true;
1322             input_matrix_reuse[i].eval_mode = eval_mode_j;
1323           }
1324         }
1325       }
1326       CeedCallBackend(CeedBasisDestroy(&basis_j));
1327     }
1328     CeedCallBackend(CeedBasisDestroy(&basis_i));
1329   }
1330 
1331   // -- Determine output mat reuse
1332   FieldReuse_Cuda output_matrix_reuse[CEED_FIELD_MAX];
1333 
1334   for (CeedInt i = 0; i < num_output_fields; i++) {
1335     output_matrix_reuse[i].index = -1;
1336   }
1337   for (CeedInt i = 0; i < num_output_fields; i++) {
1338     bool         is_tensor = true;
1339     CeedEvalMode eval_mode_i;
1340     CeedBasis    basis_i;
1341 
1342     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
1343     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
1344     CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
1345     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
1346       CeedEvalMode eval_mode_j;
1347       CeedBasis    basis_j;
1348 
1349       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1350       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1351       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1352       if (basis_i == basis_j) {
1353         if (is_tensor) {
1354           output_matrix_reuse[i].index     = j;
1355           output_matrix_reuse[i].is_input  = true;
1356           output_matrix_reuse[i].eval_mode = eval_mode_j;
1357         } else {
1358           // For non-tensor can only re-use with the same eval mode
1359           if (eval_mode_i == eval_mode_j) {
1360             output_matrix_reuse[i].index     = j;
1361             output_matrix_reuse[i].is_input  = true;
1362             output_matrix_reuse[i].eval_mode = eval_mode_j;
1363           }
1364         }
1365       }
1366       CeedCallBackend(CeedBasisDestroy(&basis_j));
1367     }
1368     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
1369       CeedEvalMode eval_mode_j;
1370       CeedBasis    basis_j;
1371 
1372       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
1373       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1374       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
1375       if (basis_i == basis_j) {
1376         if (is_tensor) {
1377           output_matrix_reuse[i].index     = j;
1378           output_matrix_reuse[i].is_input  = false;
1379           output_matrix_reuse[i].eval_mode = eval_mode_j;
1380         } else {
1381           // For non-tensor can only re-use with the same eval mode
1382           if (eval_mode_i == eval_mode_j) {
1383             output_matrix_reuse[i].index     = j;
1384             output_matrix_reuse[i].is_input  = false;
1385             output_matrix_reuse[i].eval_mode = eval_mode_j;
1386           }
1387         }
1388       }
1389       CeedCallBackend(CeedBasisDestroy(&basis_j));
1390     }
1391     CeedCallBackend(CeedBasisDestroy(&basis_i));
1392   }
1393 
1394   // Initialize constants, and matrices B and G
1395   code << "\n  // Input field constants and basis data\n";
1396   for (CeedInt i = 0; i < num_input_fields; i++) {
1397     CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], max_dim,
1398                                                               Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1399   }
1400   code << "\n  // Output field constants and basis data\n";
1401   for (CeedInt i = 0; i < num_output_fields; i++) {
1402     CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
1403                                                               max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
1404   }
1405 
1406   // Loop over all elements
1407   code << "\n  // Element loop\n";
1408   code << "  __syncthreads();\n";
1409   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
1410 
1411   // -- Compute minimum buffer space needed
1412   CeedInt max_rstr_buffer_size = 1;
1413 
1414   for (CeedInt i = 0; i < num_input_fields; i++) {
1415     CeedEvalMode eval_mode;
1416 
1417     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1418     if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
1419       CeedInt             num_comp;
1420       CeedElemRestriction elem_rstr;
1421 
1422       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
1423       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1424       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1425       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1426     }
1427   }
1428   for (CeedInt i = 0; i < num_output_fields; i++) {
1429     CeedEvalMode eval_mode;
1430 
1431     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
1432     if (eval_mode != CEED_EVAL_NONE) {
1433       CeedInt             num_comp;
1434       CeedElemRestriction elem_rstr;
1435 
1436       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
1437       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1438       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1439       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1440     }
1441   }
1442   code << "    // Scratch restriction buffer space\n";
1443   code << "    CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
1444 
1445   // -- Determine best input field processing order
1446   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
1447 
1448   for (CeedInt i = 0; i < num_input_fields; i++) {
1449     field_rstr_in_buffer[i] = -1;
1450     input_field_order[i]    = -1;
1451   }
1452   {
1453     bool    is_ordered[CEED_FIELD_MAX];
1454     CeedInt curr_index = 0;
1455 
1456     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
1457     for (CeedInt i = 0; i < num_input_fields; i++) {
1458       CeedVector          vec_i;
1459       CeedElemRestriction rstr_i;
1460 
1461       if (is_ordered[i]) continue;
1462       field_rstr_in_buffer[i]       = i;
1463       is_ordered[i]                 = true;
1464       input_field_order[curr_index] = i;
1465       curr_index++;
1466       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
1467       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
1468       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
1469       for (CeedInt j = i + 1; j < num_input_fields; j++) {
1470         CeedVector          vec_j;
1471         CeedElemRestriction rstr_j;
1472 
1473         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
1474         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
1475         if (rstr_i == rstr_j && vec_i == vec_j) {
1476           field_rstr_in_buffer[j]       = i;
1477           is_ordered[j]                 = true;
1478           input_field_order[curr_index] = j;
1479           curr_index++;
1480         }
1481         CeedCallBackend(CeedVectorDestroy(&vec_j));
1482         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
1483       }
1484       CeedCallBackend(CeedVectorDestroy(&vec_i));
1485       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
1486     }
1487   }
1488 
1489   // -- Input restriction and basis
1490   code << "\n    // -- Input field restrictions and basis actions\n";
1491   for (CeedInt i = 0; i < num_input_fields; i++) {
1492     const char   *field_name;
1493     const CeedInt f = input_field_order[i];
1494 
1495     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
1496     code << "    // ---- Input field " << f << ": " << field_name << "\n";
1497 
1498     // ---- Restriction
1499     CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], max_dim,
1500                                                                 Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1501 
1502     // ---- Basis action
1503     CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true, is_all_tensor,
1504                                                           is_at_points, use_3d_slices));
1505   }
1506 
1507   // -- Q function
1508   CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, max_dim, max_num_points, num_input_fields, op_input_fields, qf_input_fields,
1509                                                             num_output_fields, op_output_fields, qf_output_fields, qfunction_name, Q_1d,
1510                                                             is_all_tensor, is_at_points, use_3d_slices));
1511 
1512   // -- Output basis and restriction
1513   code << "\n    // -- Output field basis action and restrictions\n";
1514   for (CeedInt i = 0; i < num_output_fields; i++) {
1515     const char *field_name;
1516 
1517     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
1518     code << "    // ---- Output field " << i << ": " << field_name << "\n";
1519 
1520     // ---- Basis action
1521     CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
1522                                                           is_all_tensor, is_at_points, use_3d_slices));
1523 
1524     // ---- Restriction
1525     CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, NULL, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
1526                                                                 is_all_tensor, is_at_points, use_3d_slices));
1527   }
1528 
1529   // Close loop and function
1530   code << "  }\n";
1531   code << "}\n";
1532   code << "// -----------------------------------------------------------------------------\n\n";
1533 
1534   // Compile
1535   {
1536     bool          is_compile_good = false;
1537     const CeedInt T_1d            = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d);
1538 
1539     data->thread_1d = T_1d;
1540     CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "OP_T_1D", T_1d));
1541     if (is_compile_good) {
1542       *is_good_build = true;
1543       CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op));
1544     } else {
1545       *is_good_build     = false;
1546       data->use_fallback = true;
1547     }
1548   }
1549   CeedCallBackend(CeedOperatorSetSetupDone(op));
1550   CeedCallBackend(CeedDestroy(&ceed));
1551   CeedCallBackend(CeedQFunctionDestroy(&qf));
1552   return CEED_ERROR_SUCCESS;
1553 }
1554 
1555 //------------------------------------------------------------------------------
1556