xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision c9192aca9c02dc42a6a7d7a897b4af02df4a189e)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #define CEED_DEBUG_COLOR 12
9 
10 #include <ceed.h>
11 #include <ceed/backend.h>
12 #include <ceed/jit-tools.h>
13 #include <cuda_runtime.h>
14 
15 #include <iostream>
16 #include <sstream>
17 #include <string>
18 
19 #include "../cuda-ref/ceed-cuda-ref.h"
20 #include "../cuda-shared/ceed-cuda-shared.h"
21 #include "../cuda/ceed-cuda-common.h"
22 #include "../cuda/ceed-cuda-compile.h"
23 #include "ceed-cuda-gen.h"
24 
25 //------------------------------------------------------------------------------
26 // Determine type of operator
27 //------------------------------------------------------------------------------
28 static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
29                                                 CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields,
30                                                 CeedQFunctionField *qf_output_fields, CeedInt *max_P_1d, CeedInt *Q_1d, CeedInt *dim, bool *is_tensor,
31                                                 bool *use_3d_slices) {
32   // Find dim, P_1d, Q_1d
33   *max_P_1d  = 0;
34   *Q_1d      = 0;
35   *dim       = 0;
36   *is_tensor = true;
37 
38   for (CeedInt i = 0; i < num_input_fields; i++) {
39     CeedBasis basis;
40 
41     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
42     if (basis != CEED_BASIS_NONE) {
43       bool    is_field_tensor;
44       CeedInt field_P_1d = 0, field_Q_1d = 0, field_dim = 0;
45 
46       // Collect dim, P_1d, and Q_1d
47       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
48       *is_tensor = *is_tensor && is_field_tensor;
49       if (is_field_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
50       else CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P_1d));
51       *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
52       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
53       CeedCheck(*dim == 0 || field_dim == *dim, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
54       *dim = field_dim;
55       if (is_field_tensor) CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
56       else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q_1d));
57       CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
58       *Q_1d = field_Q_1d;
59     }
60     CeedCallBackend(CeedBasisDestroy(&basis));
61   }
62   for (CeedInt i = 0; i < num_output_fields; i++) {
63     CeedBasis basis;
64 
65     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
66     if (basis != CEED_BASIS_NONE) {
67       bool    is_field_tensor;
68       CeedInt field_P_1d = 0, field_Q_1d = 0, field_dim = 0;
69 
70       // Collect dim, P_1d, and Q_1d
71       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
72       *is_tensor = *is_tensor && is_field_tensor;
73       if (is_field_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
74       else CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P_1d));
75       *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
76       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
77       CeedCheck(*dim == 0 || field_dim == *dim, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
78       *dim = field_dim;
79       if (is_field_tensor) CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
80       else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q_1d));
81       CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
82       *Q_1d = field_Q_1d;
83     }
84     CeedCallBackend(CeedBasisDestroy(&basis));
85   }
86 
87   // Only use 3D collocated gradient parallelization strategy when gradient is computed
88   *use_3d_slices = false;
89   if (*dim == 3) {
90     bool was_grad_found = false;
91 
92     for (CeedInt i = 0; i < num_input_fields; i++) {
93       CeedEvalMode eval_mode;
94 
95       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
96       if (eval_mode == CEED_EVAL_GRAD) {
97         CeedBasis_Cuda_shared *basis_data;
98         CeedBasis              basis;
99 
100         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
101         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
102         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
103         was_grad_found = true;
104         CeedCallBackend(CeedBasisDestroy(&basis));
105       }
106     }
107     for (CeedInt i = 0; i < num_output_fields; i++) {
108       CeedEvalMode eval_mode;
109 
110       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
111       if (eval_mode == CEED_EVAL_GRAD) {
112         CeedBasis_Cuda_shared *basis_data;
113         CeedBasis              basis;
114 
115         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
116         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
117         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
118         was_grad_found = true;
119         CeedCallBackend(CeedBasisDestroy(&basis));
120       }
121     }
122   }
123   return CEED_ERROR_SUCCESS;
124 }
125 
126 //------------------------------------------------------------------------------
127 // Setup fields
128 //------------------------------------------------------------------------------
129 static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedOperatorField op_field,
130                                                      CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool is_tensor, bool is_at_points,
131                                                      bool use_3d_slices) {
132   std::string            var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
133   std::string            P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
134   std::string            option_name = (is_input ? "inputs" : "outputs");
135   CeedEvalMode           eval_mode   = CEED_EVAL_NONE;
136   CeedInt                elem_size = 0, num_comp = 0, P_1d = 0;
137   CeedElemRestriction    elem_rstr;
138   CeedBasis_Cuda_shared *basis_data;
139   CeedBasis              basis;
140 
141   code << "  // -- " << (is_input ? "Input" : "Output") << " field " << i << "\n";
142 
143   // Get field data
144   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
145   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
146     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
147     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
148   }
149   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
150   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
151   if (basis != CEED_BASIS_NONE) {
152     CeedCallBackend(CeedBasisGetData(basis, &basis_data));
153     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
154     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
155   }
156   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
157 
158   // Set field constants
159   if (eval_mode != CEED_EVAL_WEIGHT) {
160     code << "  const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
161     code << "  const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
162   }
163 
164   // Load basis data
165   code << "  // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
166   switch (eval_mode) {
167     case CEED_EVAL_NONE:
168       break;
169     case CEED_EVAL_INTERP:
170       if (is_at_points) {
171         // AtPoints
172         if (!basis_data->d_chebyshev_interp_1d) {
173           CeedSize    interp_bytes;
174           CeedScalar *chebyshev_interp_1d;
175 
176           interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
177           CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
178           CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
179           CeedCallCuda(CeedBasisReturnCeed(basis), cudaMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
180           CeedCallCuda(CeedBasisReturnCeed(basis),
181                        cudaMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
182           CeedCallBackend(CeedFree(&chebyshev_interp_1d));
183         }
184         if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
185         else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
186       } else {
187         // Standard quadrature
188         if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
189         else data->B.outputs[i] = basis_data->d_interp_1d;
190       }
191       code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
192       code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
193       break;
194     case CEED_EVAL_GRAD:
195       if (is_at_points) {
196         // AtPoints
197         if (!basis_data->d_chebyshev_interp_1d) {
198           CeedSize    interp_bytes;
199           CeedScalar *chebyshev_interp_1d;
200 
201           interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
202           CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
203           CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
204           CeedCallCuda(CeedBasisReturnCeed(basis), cudaMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
205           CeedCallCuda(CeedBasisReturnCeed(basis),
206                        cudaMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
207           CeedCallBackend(CeedFree(&chebyshev_interp_1d));
208         }
209         if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
210         else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
211       } else {
212         // Standard quadrature
213         if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
214         else data->B.outputs[i] = basis_data->d_interp_1d;
215       }
216       if (is_tensor) {
217         code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
218         code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
219       }
220       if (is_at_points) break;  // No G mat for AtPoints
221       if (use_3d_slices) {
222         if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
223         else data->G.outputs[i] = basis_data->d_collo_grad_1d;
224         code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
225         code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
226       } else {
227         bool has_collo_grad = basis_data->d_collo_grad_1d;
228 
229         if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
230         else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
231         if (has_collo_grad) {
232           code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
233           code << "  LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
234         } else {
235           code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim") << "];\n";
236           code << "  LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << ">(data, G." << option_name << "[" << i << "], s_G"
237                << var_suffix << ");\n";
238         }
239       }
240       break;
241     case CEED_EVAL_WEIGHT:
242       break;  // No action
243       // LCOV_EXCL_START
244     case CEED_EVAL_DIV:
245     case CEED_EVAL_CURL:
246       break;  // TODO: Not implemented
247               // LCOV_EXCL_STOP
248   }
249   CeedCallBackend(CeedBasisDestroy(&basis));
250   return CEED_ERROR_SUCCESS;
251 }
252 
253 //------------------------------------------------------------------------------
254 // Restriction
255 //------------------------------------------------------------------------------
256 static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim,
257                                                        CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
258                                                        CeedInt Q_1d, bool is_input, bool is_tensor, bool is_at_points, bool use_3d_slices) {
259   std::string               var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
260   std::string               P_name     = (is_tensor ? "P_1d" : "P") + var_suffix;
261   CeedEvalMode              eval_mode  = CEED_EVAL_NONE;
262   CeedInt                   elem_size = 0, num_comp = 0, P_1d = 0;
263   CeedSize                  l_size;
264   CeedRestrictionType       rstr_type = CEED_RESTRICTION_STANDARD;
265   CeedElemRestriction_Cuda *rstr_data;
266   CeedElemRestriction       elem_rstr;
267   CeedBasis                 basis;
268 
269   // Get field data
270   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
271   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
272     CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
273     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
274     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
275     CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
276   }
277   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
278   if (basis != CEED_BASIS_NONE) {
279     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
280     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
281   }
282   CeedCallBackend(CeedBasisDestroy(&basis));
283   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
284 
285   // Restriction
286   if (is_input) {
287     // Input
288     if (field_input_buffer[i] != i) {
289       std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);
290 
291       // Restriction was already done for previous input
292       code << "    CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
293     } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) {
294       if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) {
295         // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated
296         code << "    CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
297       } else if (rstr_type != CEED_RESTRICTION_POINTS) {
298         // Otherwise we're using the scratch space
299         code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
300       }
301       switch (rstr_type) {
302         case CEED_RESTRICTION_STANDARD: {
303           CeedInt comp_stride;
304 
305           CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
306           code << "    const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
307           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
308           code << "    // CompStride: " << comp_stride << "\n";
309           data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
310           code << "    ReadLVecStandard" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name
311                << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix << ");\n";
312           break;
313         }
314         case CEED_RESTRICTION_STRIDED: {
315           bool    has_backend_strides;
316           CeedInt num_elem;
317 
318           CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
319           CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
320           CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
321 
322           if (!has_backend_strides) {
323             CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
324           }
325           code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
326           code << "    ReadLVecStrided" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", " << strides[0] << ", "
327                << strides[1] << ", " << strides[2] << ">(data, elem, d" << var_suffix << ", r_e" << var_suffix << ");\n";
328           break;
329         }
330         case CEED_RESTRICTION_POINTS: {
331           CeedInt comp_stride;
332 
333           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
334           code << "    const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
335           data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
336           break;
337         }
338         // LCOV_EXCL_START
339         case CEED_RESTRICTION_ORIENTED:
340         case CEED_RESTRICTION_CURL_ORIENTED:
341           break;  // TODO: Not implemented
342                   // LCOV_EXCL_STOP
343       }
344     }
345   } else {
346     // Output
347     switch (rstr_type) {
348       case CEED_RESTRICTION_STANDARD: {
349         CeedInt comp_stride;
350 
351         CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
352         code << "    const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
353         CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
354         code << "    // CompStride: " << comp_stride << "\n";
355         data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
356         code << "    WriteLVecStandard" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name
357              << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix << ");\n";
358         break;
359       }
360       case CEED_RESTRICTION_STRIDED: {
361         bool    has_backend_strides;
362         CeedInt num_elem;
363 
364         CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
365         CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
366         CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
367 
368         if (!has_backend_strides) {
369           CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
370         }
371         code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
372         code << "    WriteLVecStrided" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", " << strides[0] << ", "
373              << strides[1] << ", " << strides[2] << ">(data, elem, r_e" << var_suffix << ", d" << var_suffix << ");\n";
374         break;
375       }
376       case CEED_RESTRICTION_POINTS:
377         data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
378         break;
379       // LCOV_EXCL_START
380       case CEED_RESTRICTION_ORIENTED:
381       case CEED_RESTRICTION_CURL_ORIENTED:
382         break;  // TODO: Not implemented
383                 // LCOV_EXCL_STOP
384     }
385   }
386   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
387   return CEED_ERROR_SUCCESS;
388 }
389 
390 //------------------------------------------------------------------------------
391 // Basis
392 //------------------------------------------------------------------------------
393 static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim,
394                                                  CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool is_tensor,
395                                                  bool is_at_points, bool use_3d_slices) {
396   std::string         var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
397   std::string         P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
398   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
399   CeedInt             elem_size = 0, num_comp = 0, P_1d = 0;
400   CeedElemRestriction elem_rstr;
401   CeedBasis           basis;
402 
403   // Get field data
404   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
405   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
406     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
407     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
408   }
409   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
410   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
411   if (basis != CEED_BASIS_NONE) {
412     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
413     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
414   }
415   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
416 
417   // Basis
418   code << "    // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
419   if (is_input) {
420     switch (eval_mode) {
421       case CEED_EVAL_NONE:
422         if (!use_3d_slices && !is_at_points) {
423           code << "    CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n";
424         }
425         break;
426       case CEED_EVAL_INTERP:
427         if (is_at_points) {
428           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
429 
430           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
431           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
432                << var_suffix << ", r_c" << var_suffix << ");\n";
433         } else {
434           std::string function_name = is_tensor ? ((dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d") : "InterpNonTensor";
435 
436           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
437           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
438                << var_suffix << ", r_q" << var_suffix << ");\n";
439         }
440         break;
441       case CEED_EVAL_GRAD:
442         if (is_at_points) {
443           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
444 
445           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
446           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
447                << var_suffix << ", r_c" << var_suffix << ");\n";
448         } else if (use_3d_slices) {
449           std::string function_name = (dim > 1 ? "InterpTensor" : "Interp") + std::to_string(dim) + "d";
450 
451           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
452           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
453                << var_suffix << ", r_q" << var_suffix << ");\n";
454         } else if (is_tensor) {
455           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
456           std::string function_name = (dim == 1 ? "Grad" : (is_collocated ? "GradTensorCollocated" : "GradTensor")) + std::to_string(dim) + "d";
457 
458           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << (dim >= 3 ? Q_name : "1") << "];\n";
459           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B"
460                << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
461         } else {
462           std::string function_name = "GradNonTensor";
463 
464           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
465           code << "    " << function_name << "<num_comp" << var_suffix << ", dim, " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix
466                << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
467         }
468         break;
469       case CEED_EVAL_WEIGHT: {
470         if (is_at_points) {
471           code << "    // Nothing to do AtPoints\n";
472         } else {
473           CeedBasis_Cuda_shared *basis_data;
474           std::string            function_name = is_tensor ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d") : "WeightNonTensor";
475 
476           code << "    CeedScalar r_q" << var_suffix << "[" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
477           CeedCallBackend(CeedBasisGetData(basis, &basis_data));
478           data->W = basis_data->d_q_weight_1d;
479           code << "    " << function_name << "<" << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
480         }
481         break;
482       }
483       // LCOV_EXCL_START
484       case CEED_EVAL_DIV:
485       case CEED_EVAL_CURL:
486         break;  // TODO: Not implemented
487                 // LCOV_EXCL_STOP
488     }
489   } else {
490     switch (eval_mode) {
491       case CEED_EVAL_NONE:
492         code << "    CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
493         break;  // No action
494       case CEED_EVAL_INTERP:
495         code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
496         if (is_at_points) {
497           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
498 
499           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_c" << var_suffix << ", s_B"
500                << var_suffix << ", r_e" << var_suffix << ");\n";
501         } else {
502           std::string function_name =
503               is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d") : "InterpTransposeNonTensor";
504 
505           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix << ", s_B"
506                << var_suffix << ", r_e" << var_suffix << ");\n";
507         }
508         break;
509       case CEED_EVAL_GRAD:
510         code << "    CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
511         if (is_at_points) {
512           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
513 
514           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_c" << var_suffix << ", s_B"
515                << var_suffix << ", r_e" << var_suffix << ");\n";
516         } else if (use_3d_slices) {
517           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
518 
519           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix << ", s_B"
520                << var_suffix << ", r_e" << var_suffix << ");\n";
521         } else if (is_tensor) {
522           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
523           std::string function_name =
524               (dim == 1 ? "GradTranspose" : (is_collocated ? "GradTransposeTensorCollocated" : "GradTransposeTensor")) + std::to_string(dim) + "d";
525 
526           code << "    " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix << ", s_B"
527                << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
528         } else {
529           std::string function_name = "GradTransposeNonTensor";
530 
531           code << "    " << function_name << "<num_comp" << var_suffix << ", dim, " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix
532                << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
533         }
534         break;
535       // LCOV_EXCL_START
536       case CEED_EVAL_WEIGHT:
537         break;  // Should not occur
538       case CEED_EVAL_DIV:
539       case CEED_EVAL_CURL:
540         break;  // TODO: Not implemented
541                 // LCOV_EXCL_STOP
542     }
543   }
544   CeedCallBackend(CeedBasisDestroy(&basis));
545   return CEED_ERROR_SUCCESS;
546 }
547 
548 //------------------------------------------------------------------------------
549 // QFunction
550 //------------------------------------------------------------------------------
551 static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt dim, CeedInt max_num_points,
552                                                      CeedInt num_input_fields, CeedOperatorField *op_input_fields,
553                                                      CeedQFunctionField *qf_input_fields, CeedInt num_output_fields,
554                                                      CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields,
555                                                      std::string qfunction_name, CeedInt Q_1d, bool is_tensor, bool is_at_points,
556                                                      bool use_3d_slices) {
557   std::string         Q_name    = is_tensor ? "Q_1d" : "Q";
558   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
559   CeedElemRestriction elem_rstr;
560 
561   // Setup output arrays
562   code << "\n    // -- Output field setup\n";
563   for (CeedInt i = 0; i < num_output_fields; i++) {
564     std::string var_suffix = "_out_" + std::to_string(i);
565 
566     code << "    // ---- Output field " << i << "\n";
567     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
568     switch (eval_mode) {
569       case CEED_EVAL_NONE:
570         if (is_at_points) {
571           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n";
572         } else {
573           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
574         }
575         break;
576       case CEED_EVAL_INTERP:
577         if (is_at_points) {
578           // Accumulator for point data
579           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
580           code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "; i++) {\n";
581           code << "      r_c" << var_suffix << "[i] = 0.0;\n";
582           code << "    }\n";
583         } else {
584           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
585         }
586         break;
587       case CEED_EVAL_GRAD:
588         if (is_at_points) {
589           // Accumulator for point data
590           code << "    CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "*dim];\n";
591           code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "; i++) {\n";
592           code << "      r_c" << var_suffix << "[i] = 0.0;\n";
593           code << "    }\n";
594         } else if (use_3d_slices) {
595           // Accumulator for gradient slices
596           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
597           code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) {\n";
598           code << "      r_q" << var_suffix << "[i] = 0.0;\n";
599           code << "    }\n";
600         } else {
601           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
602         }
603         break;
604       case CEED_EVAL_WEIGHT:
605         break;
606         // LCOV_EXCL_START
607       case CEED_EVAL_DIV:
608       case CEED_EVAL_CURL:
609         break;  // TODO: Not implemented
610                 // LCOV_EXCL_STOP
611     }
612   }
613 
614   if (is_at_points) {
615     // We need to handle batches of points
616     code << "\n    // Note: Using batches of points\n";
617     code << "    const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * max_num_points / (blockDim.x * blockDim.y));\n\n";
618     code << "    #pragma unroll\n";
619     code << "    for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {\n";
620     code << "      const CeedInt p = i % max_num_points;\n\n";
621 
622     code << "      // -- Coordinates\n";
623     code << "      CeedScalar r_x[dim];\n";
624     code << "      ReadPoint<dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n";
625 
626     code << "      // -- Input fields\n";
627     for (CeedInt i = 0; i < num_input_fields; i++) {
628       std::string var_suffix = "_in_" + std::to_string(i);
629 
630       code << "      // ---- Input field " << i << "\n";
631       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
632       // Basis action
633       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
634       switch (eval_mode) {
635         case CEED_EVAL_NONE:
636           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
637           code << "      ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
638                << ", max_num_points>(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
639           break;
640         case CEED_EVAL_INTERP:
641           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
642           code << "      InterpAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_c" << var_suffix
643                << ", r_x, r_s" << var_suffix << ");\n";
644           break;
645         case CEED_EVAL_GRAD:
646           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
647           code << "      GradAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_c" << var_suffix
648                << ", r_x, r_s" << var_suffix << ");\n";
649           break;
650         case CEED_EVAL_WEIGHT:
651           code << "      CeedScalar r_s" << var_suffix << "[1];\n";
652           code << "      r_s" << var_suffix << "[0] = 1.0;\n";
653           break;
654           // LCOV_EXCL_START
655         case CEED_EVAL_DIV:
656         case CEED_EVAL_CURL:
657           break;  // TODO: Not implemented
658                   // LCOV_EXCL_STOP
659       }
660     }
661     code << "\n      // -- Output fields\n";
662     for (CeedInt i = 0; i < num_output_fields; i++) {
663       std::string var_suffix = "_out_" + std::to_string(i);
664 
665       code << "      // ---- Output field " << i << "\n";
666       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
667       // Basis action
668       switch (eval_mode) {
669         case CEED_EVAL_NONE:
670           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
671           break;
672         case CEED_EVAL_INTERP:
673           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
674           break;
675         case CEED_EVAL_GRAD:
676           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
677           break;
678           // LCOV_EXCL_START
679         case CEED_EVAL_WEIGHT:
680           break;  // Should not occur
681         case CEED_EVAL_DIV:
682         case CEED_EVAL_CURL:
683           break;  // TODO: Not implemented
684                   // LCOV_EXCL_STOP
685       }
686     }
687 
688   } else if (use_3d_slices) {
689     // We treat quadrature points per slice in 3d to save registers
690     code << "\n    // Note: Using planes of 3D elements\n";
691     code << "    #pragma unroll\n";
692     code << "    for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
693     code << "      // -- Input fields\n";
694     for (CeedInt i = 0; i < num_input_fields; i++) {
695       std::string var_suffix = "_in_" + std::to_string(i);
696 
697       code << "      // ---- Input field " << i << "\n";
698       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
699       // Basis action
700       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
701       switch (eval_mode) {
702         case CEED_EVAL_NONE:
703           bool is_strided;
704 
705           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
706 
707           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
708           CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
709           if (is_strided) {
710             bool    has_backend_strides;
711             CeedInt num_elem, elem_size;
712 
713             CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
714             CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
715             CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
716             CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
717 
718             if (!has_backend_strides) {
719               CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
720             }
721             code << "      // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
722             code << "      ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", " << strides[0] << ", " << strides[1] << ", "
723                  << strides[2] << ">(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n";
724           } else {
725             CeedSize                  l_size = 0;
726             CeedInt                   comp_stride;
727             CeedElemRestriction_Cuda *rstr_data;
728 
729             CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
730             code << "      const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
731             CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
732             code << "      // CompStride: " << comp_stride << "\n";
733             CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
734             data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
735             code << "      ReadEVecSliceStandard3d<num_comp" << var_suffix << ", " << comp_stride << ", " << Q_name << ">(data, l_size" << var_suffix
736                  << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
737           }
738           CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
739           break;
740         case CEED_EVAL_INTERP:
741           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
742           code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n";
743           code << "        r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n";
744           code << "      }\n";
745           break;
746         case CEED_EVAL_GRAD:
747           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
748           code << "      GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_q" << var_suffix << ", s_G" << var_suffix
749                << ", r_s" << var_suffix << ");\n";
750           break;
751         case CEED_EVAL_WEIGHT:
752           code << "      CeedScalar r_s" << var_suffix << "[1];\n";
753           code << "      r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n";
754           break;
755           // LCOV_EXCL_START
756         case CEED_EVAL_DIV:
757         case CEED_EVAL_CURL:
758           break;  // TODO: Not implemented
759                   // LCOV_EXCL_STOP
760       }
761     }
762     code << "\n      // -- Output fields\n";
763     for (CeedInt i = 0; i < num_output_fields; i++) {
764       std::string var_suffix = "_out_" + std::to_string(i);
765 
766       code << "      // ---- Output field " << i << "\n";
767       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
768       // Basis action
769       switch (eval_mode) {
770         case CEED_EVAL_NONE:
771           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
772           break;
773         case CEED_EVAL_INTERP:
774           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
775           break;
776         case CEED_EVAL_GRAD:
777           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
778           break;
779           // LCOV_EXCL_START
780         case CEED_EVAL_WEIGHT:
781           break;  // Should not occur
782         case CEED_EVAL_DIV:
783         case CEED_EVAL_CURL:
784           break;  // TODO: Not implemented
785                   // LCOV_EXCL_STOP
786       }
787     }
788   } else {
789     code << "\n    // Note: Using full elements\n";
790     code << "    {\n";
791     code << "      // -- Input fields\n";
792     for (CeedInt i = 0; i < num_input_fields; i++) {
793       code << "      // ---- Input field " << i << "\n";
794       code << "      CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n";
795     }
796     code << "      // -- Output fields\n";
797     for (CeedInt i = 0; i < num_output_fields; i++) {
798       code << "      // ---- Output field " << i << "\n";
799       code << "      CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n";
800     }
801   }
802 
803   // Input and output buffers
804   code << "\n      // -- QFunction inputs and outputs\n";
805   code << "      // ---- Inputs\n";
806   code << "      CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
807   for (CeedInt i = 0; i < num_input_fields; i++) {
808     code << "      // ------ Input field " << i << "\n";
809     code << "      inputs[" << i << "] = r_s_in_" << i << ";\n";
810   }
811   code << "      // ---- Outputs\n";
812   code << "      CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
813   for (CeedInt i = 0; i < num_output_fields; i++) {
814     code << "      // ------ Output field " << i << "\n";
815     code << "      outputs[" << i << "] = r_s_out_" << i << ";\n";
816   }
817 
818   // Apply QFunction
819   code << "\n      // -- Apply QFunction\n";
820   code << "      " << qfunction_name << "(ctx, ";
821   if (dim != 3 || is_at_points || use_3d_slices || !is_tensor) {
822     code << "1";
823   } else {
824     code << Q_name;
825   }
826   code << ", inputs, outputs);\n";
827 
828   if (is_at_points) {
829     // Map back to coefficients
830     code << "\n      // -- Output fields\n";
831     for (CeedInt i = 0; i < num_output_fields; i++) {
832       std::string var_suffix = "_out_" + std::to_string(i);
833       std::string P_name     = "P_1d" + var_suffix;
834 
835       code << "      // ---- Output field " << i << "\n";
836       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
837       // Basis action
838       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
839       switch (eval_mode) {
840         case CEED_EVAL_NONE: {
841           CeedInt             comp_stride;
842           CeedElemRestriction elem_rstr;
843 
844           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
845           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
846           CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
847           code << "      const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
848           code << "      WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
849                << ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]"
850                << ", r_s" << var_suffix << ", d" << var_suffix << ");\n";
851           break;
852         }
853         case CEED_EVAL_INTERP:
854           code << "      if (i >= points.num_per_elem[elem]) {\n";
855           code << "        for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
856           code << "      }\n";
857           code << "      InterpTransposeAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_s"
858                << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
859           break;
860         case CEED_EVAL_GRAD:
861           code << "      if (i >= points.num_per_elem[elem]) {\n";
862           code << "        for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim; j++) r_s" << var_suffix << "[j] = 0.0;\n";
863           code << "      }\n";
864           code << "      GradTransposeAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_s"
865                << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
866           break;
867           // LCOV_EXCL_START
868         case CEED_EVAL_WEIGHT:
869           break;  // Should not occur
870         case CEED_EVAL_DIV:
871         case CEED_EVAL_CURL:
872           break;  // TODO: Not implemented
873                   // LCOV_EXCL_STOP
874       }
875     }
876   } else if (use_3d_slices) {
877     // Copy or apply transpose grad, if needed
878     code << "\n      // -- Output fields\n";
879     for (CeedInt i = 0; i < num_output_fields; i++) {
880       std::string var_suffix = "_out_" + std::to_string(i);
881       std::string P_name     = "P_1d" + var_suffix;
882 
883       code << "      // ---- Output field " << i << "\n";
884       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
885       // Basis action
886       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
887       switch (eval_mode) {
888         case CEED_EVAL_NONE:
889           code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
890           code << "        r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
891           code << "      }\n";
892           break;
893         case CEED_EVAL_INTERP:
894           code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
895           code << "        r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
896           code << "      }\n";
897           break;
898         case CEED_EVAL_GRAD:
899           code << "      GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_s" << var_suffix << ", s_G"
900                << var_suffix << ", r_q" << var_suffix << ");\n";
901           break;
902           // LCOV_EXCL_START
903         case CEED_EVAL_WEIGHT:
904           break;  // Should not occur
905         case CEED_EVAL_DIV:
906         case CEED_EVAL_CURL:
907           break;  // TODO: Not implemented
908                   // LCOV_EXCL_STOP
909       }
910     }
911   }
912   code << "    }\n";
913   return CEED_ERROR_SUCCESS;
914 }
915 
916 //------------------------------------------------------------------------------
917 // Build single operator kernel
918 //------------------------------------------------------------------------------
919 extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_build) {
920   bool                    is_tensor = true, is_at_points = false, use_3d_slices = false;
921   Ceed                    ceed;
922   CeedInt                 Q_1d, num_input_fields, num_output_fields, dim = 1, max_num_points = 0, coords_comp_stride = 0;
923   CeedQFunctionField     *qf_input_fields, *qf_output_fields;
924   CeedQFunction_Cuda_gen *qf_data;
925   CeedQFunction           qf;
926   CeedOperatorField      *op_input_fields, *op_output_fields;
927   CeedOperator_Cuda_gen  *data;
928   std::ostringstream      code;
929 
930   CeedCallBackend(CeedOperatorGetData(op, &data));
931   {
932     bool is_setup_done;
933 
934     CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
935     if (is_setup_done) {
936       *is_good_build = !data->use_fallback;
937       return CEED_ERROR_SUCCESS;
938     }
939   }
940 
941   // Check field compatibility
942   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
943   {
944     bool has_shared_bases = true, is_all_tensor = true, is_all_nontensor = true;
945 
946     for (CeedInt i = 0; i < num_input_fields; i++) {
947       CeedBasis basis;
948 
949       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
950       if (basis != CEED_BASIS_NONE) {
951         bool        is_tensor = true;
952         const char *resource;
953         char       *resource_root;
954         Ceed        basis_ceed;
955 
956         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
957         is_all_tensor    = is_all_tensor && is_tensor;
958         is_all_nontensor = is_all_not_tensor && !is_tensor;
959         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
960         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
961         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
962         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared");
963         CeedCallBackend(CeedFree(&resource_root));
964         CeedCallBackend(CeedDestroy(&basis_ceed));
965       }
966       CeedCallBackend(CeedBasisDestroy(&basis));
967     }
968 
969     for (CeedInt i = 0; i < num_output_fields; i++) {
970       CeedBasis basis;
971 
972       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
973       if (basis != CEED_BASIS_NONE) {
974         bool        is_tensor = true;
975         const char *resource;
976         char       *resource_root;
977         Ceed        basis_ceed;
978 
979         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
980         is_all_tensor    = is_all_tensor && is_tensor;
981         is_all_nontensor = is_all_nontensor && !is_tensor;
982 
983         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
984         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
985         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
986         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared");
987         CeedCallBackend(CeedFree(&resource_root));
988         CeedCallBackend(CeedDestroy(&basis_ceed));
989       }
990       CeedCallBackend(CeedBasisDestroy(&basis));
991     }
992     // -- Fallback to ref if not all bases are shared
993     if (!has_shared_bases || (!is_all_tensor && !is_all_nontensor)) {
994       *is_good_build = false;
995       return CEED_ERROR_SUCCESS;
996     }
997   }
998   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
999   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
1000   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
1001   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
1002 
1003   // Get operator data
1004   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
1005   CeedCallBackend(CeedOperatorBuildKernelData_Cuda_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields,
1006                                                        qf_output_fields, &data->max_P_1d, &Q_1d, &dim, &is_tensor, &use_3d_slices));
1007   if (dim == 0) dim = 1;
1008   data->dim = dim;
1009   if (is_at_points) {
1010     CeedElemRestriction_Cuda *rstr_data;
1011     CeedElemRestriction       rstr_points = NULL;
1012 
1013     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
1014     CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
1015     CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
1016     CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data));
1017     data->points.indices = (CeedInt *)rstr_data->d_offsets;
1018     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
1019   }
1020   if (is_at_points) use_3d_slices = false;
1021   if (Q_1d == 0) {
1022     if (is_at_points) Q_1d = max_num_points;
1023     else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d));
1024   }
1025   data->Q_1d = Q_1d;
1026 
1027   // Check for restriction only identity operator
1028   {
1029     bool is_identity_qf;
1030 
1031     CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf));
1032     if (is_identity_qf) {
1033       CeedEvalMode eval_mode_in, eval_mode_out;
1034 
1035       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in));
1036       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out));
1037       CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND,
1038                 "Backend does not implement restriction only identity operators");
1039     }
1040   }
1041 
1042   // Add atomicAdd function for old NVidia architectures
1043   {
1044     Ceed_Cuda            *ceed_data;
1045     struct cudaDeviceProp prop;
1046 
1047     CeedCallBackend(CeedGetData(ceed, &ceed_data));
1048     CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id));
1049     if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) {
1050       code << "// AtomicAdd fallback source\n";
1051       code << "#include <ceed/jit-source/cuda/cuda-atomic-add-fallback.h>\n\n";
1052     }
1053   }
1054 
1055   // Load basis source files
1056   if (is_tensor) {
1057     code << "// Tensor basis source\n";
1058     code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n";
1059   } else {
1060     code << "// Non-tensor basis source\n";
1061     code << "#include <ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h>\n\n";
1062   }
1063   if (is_at_points) {
1064     code << "// AtPoints basis source\n";
1065     code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h>\n\n";
1066   }
1067   code << "// CodeGen operator source\n";
1068   code << "#include <ceed/jit-source/cuda/cuda-gen-templates.h>\n\n";
1069 
1070   // Get QFunction name
1071   std::string qfunction_name(qf_data->qfunction_name);
1072   std::string operator_name;
1073 
1074   operator_name = "CeedKernelCudaGenOperator_" + qfunction_name;
1075 
1076   // Define CEED_Q_VLA
1077   code << "\n#undef CEED_Q_VLA\n";
1078   if (dim != 3 || is_at_points || use_3d_slices || !is_tensor) {
1079     code << "#define CEED_Q_VLA 1\n\n";
1080   } else {
1081     code << "#define CEED_Q_VLA " << Q_1d << "\n\n";
1082   }
1083 
1084   // Add user QFunction source
1085   {
1086     const char *source_path;
1087 
1088     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
1089     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/cuda/gen backend requires QFunction source code file");
1090 
1091     code << "// User QFunction source\n";
1092     code << "#include \"" << source_path << "\"\n\n";
1093   }
1094 
1095   // Setup
1096   code << "\n// -----------------------------------------------------------------------------\n";
1097   code << "// Operator Kernel\n";
1098   code << "// \n";
1099   code << "// d_[in,out]_i:   CeedVector device array\n";
1100   code << "// r_[in,out]_e_i: Element vector register\n";
1101   code << "// r_[in,out]_q_i: Quadrature space vector register\n";
1102   code << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
1103   code << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
1104   code << "// \n";
1105   code << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
1106   code << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
1107   code << "// -----------------------------------------------------------------------------\n";
1108   code << "extern \"C\" __global__ void " << operator_name
1109        << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda "
1110           "points) {\n";
1111 
1112   // Scratch buffers
1113   for (CeedInt i = 0; i < num_input_fields; i++) {
1114     CeedEvalMode eval_mode;
1115 
1116     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1117     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
1118       code << "  const CeedScalar *d_in_" << i << " = fields.inputs[" << i << "];\n";
1119     }
1120   }
1121   for (CeedInt i = 0; i < num_output_fields; i++) {
1122     code << "  CeedScalar *d_out_" << i << " = fields.outputs[" << i << "];\n";
1123   }
1124 
1125   code << "  const CeedInt dim = " << dim << ";\n";
1126   code << "  const CeedInt " << (is_tensor ? "Q_1d" : "Q") << " = " << Q_1d << ";\n";
1127   if (is_at_points) {
1128     code << "  const CeedInt max_num_points = " << max_num_points << ";\n";
1129     code << "  const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
1130   }
1131 
1132   // Shared data
1133   code << "  extern __shared__ CeedScalar slice[];\n";
1134   code << "  SharedData_Cuda data;\n";
1135   code << "  data.t_id_x = threadIdx.x;\n";
1136   code << "  data.t_id_y = threadIdx.y;\n";
1137   code << "  data.t_id_z = threadIdx.z;\n";
1138   code << "  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
1139   code << "  data.slice = slice + data.t_id_z*T_1D" << ((!is_tensor || dim == 1) ? "" : "*T_1D") << ";\n";
1140 
1141   // Initialize constants, and matrices B and G
1142   code << "\n  // Input field constants and basis data\n";
1143   for (CeedInt i = 0; i < num_input_fields; i++) {
1144     CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, is_tensor,
1145                                                               is_at_points, use_3d_slices));
1146   }
1147   code << "\n  // Output field constants and basis data\n";
1148   for (CeedInt i = 0; i < num_output_fields; i++) {
1149     CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_tensor,
1150                                                               is_at_points, use_3d_slices));
1151   }
1152 
1153   // Loop over all elements
1154   code << "\n  // Element loop\n";
1155   code << "  __syncthreads();\n";
1156   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
1157 
1158   // -- Compute minimum buffer space needed
1159   CeedInt max_rstr_buffer_size = 1;
1160 
1161   for (CeedInt i = 0; i < num_input_fields; i++) {
1162     CeedInt             num_comp, elem_size;
1163     CeedElemRestriction elem_rstr;
1164 
1165     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
1166     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1167     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
1168     max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_tensor && (dim >= 3) ? elem_size : 1));
1169     CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1170   }
1171   for (CeedInt i = 0; i < num_output_fields; i++) {
1172     CeedInt             num_comp, elem_size;
1173     CeedElemRestriction elem_rstr;
1174 
1175     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
1176     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1177     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
1178     max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_tensor && (dim >= 3) ? elem_size : 1));
1179     CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1180   }
1181   code << "    // Scratch restriction buffer space\n";
1182   code << "    CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
1183 
1184   // -- Determine best input field processing order
1185   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
1186 
1187   for (CeedInt i = 0; i < num_input_fields; i++) {
1188     field_rstr_in_buffer[i] = -1;
1189     input_field_order[i]    = -1;
1190   }
1191   {
1192     bool    is_ordered[CEED_FIELD_MAX];
1193     CeedInt curr_index = 0;
1194 
1195     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
1196     for (CeedInt i = 0; i < num_input_fields; i++) {
1197       CeedVector          vec_i;
1198       CeedElemRestriction rstr_i;
1199 
1200       if (is_ordered[i]) continue;
1201       field_rstr_in_buffer[i]       = i;
1202       is_ordered[i]                 = true;
1203       input_field_order[curr_index] = i;
1204       curr_index++;
1205       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
1206       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
1207       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
1208       for (CeedInt j = i + 1; j < num_input_fields; j++) {
1209         CeedVector          vec_j;
1210         CeedElemRestriction rstr_j;
1211 
1212         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
1213         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
1214         if (rstr_i == rstr_j && vec_i == vec_j) {
1215           field_rstr_in_buffer[j]       = i;
1216           is_ordered[j]                 = true;
1217           input_field_order[curr_index] = j;
1218           curr_index++;
1219         }
1220         CeedCallBackend(CeedVectorDestroy(&vec_j));
1221         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
1222       }
1223       CeedCallBackend(CeedVectorDestroy(&vec_i));
1224       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
1225     }
1226   }
1227 
1228   // -- Input restriction and basis
1229   code << "\n    // -- Input field restrictions and basis actions\n";
1230   for (CeedInt i = 0; i < num_input_fields; i++) {
1231     CeedInt f = input_field_order[i];
1232 
1233     code << "    // ---- Input field " << f << "\n";
1234 
1235     // ---- Restriction
1236     CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, f, dim, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
1237                                                                 Q_1d, true, is_tensor, is_at_points, use_3d_slices));
1238 
1239     // ---- Basis action
1240     CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, is_tensor,
1241                                                           is_at_points, use_3d_slices));
1242   }
1243 
1244   // -- Q function
1245   CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, dim, max_num_points, num_input_fields, op_input_fields, qf_input_fields,
1246                                                             num_output_fields, op_output_fields, qf_output_fields, qfunction_name, Q_1d, is_tensor,
1247                                                             is_at_points, use_3d_slices));
1248 
1249   // -- Output basis and restriction
1250   code << "\n    // -- Output field basis action and restrictions\n";
1251   for (CeedInt i = 0; i < num_output_fields; i++) {
1252     code << "    // ---- Output field " << i << "\n";
1253 
1254     // ---- Basis action
1255     CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_tensor,
1256                                                           is_at_points, use_3d_slices));
1257 
1258     // ---- Restriction
1259     CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false,
1260                                                                 is_tensor, is_at_points, use_3d_slices));
1261   }
1262 
1263   // Close loop and function
1264   code << "  }\n";
1265   code << "}\n";
1266   code << "// -----------------------------------------------------------------------------\n\n";
1267 
1268   // Compile
1269   {
1270     bool is_compile_good = false;
1271 
1272     CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "T_1D", CeedIntMax(Q_1d, data->max_P_1d)));
1273     if (is_compile_good) {
1274       *is_good_build = true;
1275       CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op));
1276     } else {
1277       *is_good_build     = false;
1278       data->use_fallback = true;
1279     }
1280   }
1281   CeedCallBackend(CeedOperatorSetSetupDone(op));
1282   CeedCallBackend(CeedDestroy(&ceed));
1283   CeedCallBackend(CeedQFunctionDestroy(&qf));
1284   return CEED_ERROR_SUCCESS;
1285 }
1286 
1287 //------------------------------------------------------------------------------
1288