xref: /libCEED/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp (revision 4b3e95d5da7d4d1a3e8f0acb9d6f6cc36918381b)
15aed82e4SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3241a4b83SYohann //
43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
5241a4b83SYohann //
63d8e8822SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
73d576824SJeremy L Thompson 
8b8e71988SJeremy L Thompson #define CEED_DEBUG_COLOR 12
97df94212SJeremy L Thompson 
1049aac155SJeremy L Thompson #include <ceed.h>
11ec3da8bcSJed Brown #include <ceed/backend.h>
129e201c85SYohann #include <ceed/jit-tools.h>
133d576824SJeremy L Thompson #include <cuda_runtime.h>
142b730f8bSJeremy L Thompson 
15241a4b83SYohann #include <iostream>
16241a4b83SYohann #include <sstream>
17b2165e7aSSebastian Grimberg #include <string>
182b730f8bSJeremy L Thompson 
190d0321e0SJeremy L Thompson #include "../cuda-ref/ceed-cuda-ref.h"
20241a4b83SYohann #include "../cuda-shared/ceed-cuda-shared.h"
2149aac155SJeremy L Thompson #include "../cuda/ceed-cuda-common.h"
222b730f8bSJeremy L Thompson #include "../cuda/ceed-cuda-compile.h"
232b730f8bSJeremy L Thompson #include "ceed-cuda-gen.h"
24241a4b83SYohann 
25ab213215SJeremy L Thompson //------------------------------------------------------------------------------
26*4b3e95d5SJeremy L Thompson // Determine type of operator
27*4b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
28*4b3e95d5SJeremy L Thompson static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
29*4b3e95d5SJeremy L Thompson                                                 CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields,
30*4b3e95d5SJeremy L Thompson                                                 CeedQFunctionField *qf_output_fields, CeedInt *max_P_1d, CeedInt *Q_1d, CeedInt *dim, bool *is_tensor,
31*4b3e95d5SJeremy L Thompson                                                 bool *use_3d_slices) {
32*4b3e95d5SJeremy L Thompson   // Find dim, P_1d, Q_1d
33*4b3e95d5SJeremy L Thompson   *max_P_1d  = 0;
34*4b3e95d5SJeremy L Thompson   *Q_1d      = 0;
35*4b3e95d5SJeremy L Thompson   *dim       = 0;
36*4b3e95d5SJeremy L Thompson   *is_tensor = true;
37*4b3e95d5SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
38*4b3e95d5SJeremy L Thompson     CeedBasis basis;
39*4b3e95d5SJeremy L Thompson 
40*4b3e95d5SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
41*4b3e95d5SJeremy L Thompson     if (basis != CEED_BASIS_NONE) {
42*4b3e95d5SJeremy L Thompson       bool    is_field_tensor;
43*4b3e95d5SJeremy L Thompson       CeedInt field_P_1d = 0, field_Q_1d = 0, field_dim = 0;
44*4b3e95d5SJeremy L Thompson 
45*4b3e95d5SJeremy L Thompson       // Collect dim, P_1d, and Q_1d
46*4b3e95d5SJeremy L Thompson       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
47*4b3e95d5SJeremy L Thompson       CeedCheck(is_field_tensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
48*4b3e95d5SJeremy L Thompson       *is_tensor = *is_tensor && is_field_tensor;
49*4b3e95d5SJeremy L Thompson       CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
50*4b3e95d5SJeremy L Thompson       *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
51*4b3e95d5SJeremy L Thompson       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
52*4b3e95d5SJeremy L Thompson       CeedCheck(*dim == 0 || field_dim == *dim, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
53*4b3e95d5SJeremy L Thompson       *dim = field_dim;
54*4b3e95d5SJeremy L Thompson       CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
55*4b3e95d5SJeremy L Thompson       CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
56*4b3e95d5SJeremy L Thompson       *Q_1d = field_Q_1d;
57*4b3e95d5SJeremy L Thompson     }
58*4b3e95d5SJeremy L Thompson   }
59*4b3e95d5SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
60*4b3e95d5SJeremy L Thompson     CeedBasis basis;
61*4b3e95d5SJeremy L Thompson 
62*4b3e95d5SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
63*4b3e95d5SJeremy L Thompson     if (basis != CEED_BASIS_NONE) {
64*4b3e95d5SJeremy L Thompson       bool    is_field_tensor;
65*4b3e95d5SJeremy L Thompson       CeedInt field_P_1d = 0, field_Q_1d = 0, field_dim = 0;
66*4b3e95d5SJeremy L Thompson 
67*4b3e95d5SJeremy L Thompson       // Collect dim, P_1d, and Q_1d
68*4b3e95d5SJeremy L Thompson       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
69*4b3e95d5SJeremy L Thompson       CeedCheck(is_field_tensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
70*4b3e95d5SJeremy L Thompson       *is_tensor = *is_tensor && is_field_tensor;
71*4b3e95d5SJeremy L Thompson       CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
72*4b3e95d5SJeremy L Thompson       *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
73*4b3e95d5SJeremy L Thompson       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
74*4b3e95d5SJeremy L Thompson       CeedCheck(*dim == 0 || field_dim == *dim, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
75*4b3e95d5SJeremy L Thompson       *dim = field_dim;
76*4b3e95d5SJeremy L Thompson       CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
77*4b3e95d5SJeremy L Thompson       CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
78*4b3e95d5SJeremy L Thompson       *Q_1d = field_Q_1d;
79*4b3e95d5SJeremy L Thompson     }
80*4b3e95d5SJeremy L Thompson   }
81*4b3e95d5SJeremy L Thompson 
82*4b3e95d5SJeremy L Thompson   // Only use 3D collocated gradient parallelization strategy when gradient is computed
83*4b3e95d5SJeremy L Thompson   *use_3d_slices = false;
84*4b3e95d5SJeremy L Thompson   if (*dim == 3) {
85*4b3e95d5SJeremy L Thompson     bool was_grad_found = false;
86*4b3e95d5SJeremy L Thompson 
87*4b3e95d5SJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) {
88*4b3e95d5SJeremy L Thompson       CeedEvalMode eval_mode;
89*4b3e95d5SJeremy L Thompson 
90*4b3e95d5SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
91*4b3e95d5SJeremy L Thompson       if (eval_mode == CEED_EVAL_GRAD) {
92*4b3e95d5SJeremy L Thompson         CeedBasis_Cuda_shared *basis_data;
93*4b3e95d5SJeremy L Thompson         CeedBasis              basis;
94*4b3e95d5SJeremy L Thompson 
95*4b3e95d5SJeremy L Thompson         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
96*4b3e95d5SJeremy L Thompson         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
97*4b3e95d5SJeremy L Thompson         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
98*4b3e95d5SJeremy L Thompson         was_grad_found = true;
99*4b3e95d5SJeremy L Thompson       }
100*4b3e95d5SJeremy L Thompson     }
101*4b3e95d5SJeremy L Thompson     for (CeedInt i = 0; i < num_output_fields; i++) {
102*4b3e95d5SJeremy L Thompson       CeedEvalMode eval_mode;
103*4b3e95d5SJeremy L Thompson 
104*4b3e95d5SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
105*4b3e95d5SJeremy L Thompson       if (eval_mode == CEED_EVAL_GRAD) {
106*4b3e95d5SJeremy L Thompson         CeedBasis_Cuda_shared *basis_data;
107*4b3e95d5SJeremy L Thompson         CeedBasis              basis;
108*4b3e95d5SJeremy L Thompson 
109*4b3e95d5SJeremy L Thompson         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
110*4b3e95d5SJeremy L Thompson         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
111*4b3e95d5SJeremy L Thompson         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
112*4b3e95d5SJeremy L Thompson         was_grad_found = true;
113*4b3e95d5SJeremy L Thompson       }
114*4b3e95d5SJeremy L Thompson     }
115*4b3e95d5SJeremy L Thompson   }
116*4b3e95d5SJeremy L Thompson   return CEED_ERROR_SUCCESS;
117*4b3e95d5SJeremy L Thompson }
118*4b3e95d5SJeremy L Thompson 
119*4b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
120*4b3e95d5SJeremy L Thompson // Setup fields
121*4b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
122*4b3e95d5SJeremy L Thompson static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedOperatorField op_field,
123*4b3e95d5SJeremy L Thompson                                                      CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool use_3d_slices) {
124*4b3e95d5SJeremy L Thompson   std::string            var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
125*4b3e95d5SJeremy L Thompson   std::string            P_name = "P_1d" + var_suffix, Q_name = "Q_1d";
126*4b3e95d5SJeremy L Thompson   std::string            option_name = (is_input ? "inputs" : "outputs");
127*4b3e95d5SJeremy L Thompson   CeedEvalMode           eval_mode   = CEED_EVAL_NONE;
128*4b3e95d5SJeremy L Thompson   CeedInt                elem_size = 0, num_comp = 0, P_1d = 0;
129*4b3e95d5SJeremy L Thompson   CeedElemRestriction    elem_rstr;
130*4b3e95d5SJeremy L Thompson   CeedBasis_Cuda_shared *basis_data;
131*4b3e95d5SJeremy L Thompson   CeedBasis              basis;
132*4b3e95d5SJeremy L Thompson 
133*4b3e95d5SJeremy L Thompson   code << "  // -- " << (is_input ? "Input" : "Output") << " field " << i << "\n";
134*4b3e95d5SJeremy L Thompson 
135*4b3e95d5SJeremy L Thompson   // Get field data
136*4b3e95d5SJeremy L Thompson   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
137*4b3e95d5SJeremy L Thompson   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
138*4b3e95d5SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
139*4b3e95d5SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
140*4b3e95d5SJeremy L Thompson   }
141*4b3e95d5SJeremy L Thompson   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
142*4b3e95d5SJeremy L Thompson   if (basis != CEED_BASIS_NONE) {
143*4b3e95d5SJeremy L Thompson     CeedCallBackend(CeedBasisGetData(basis, &basis_data));
144*4b3e95d5SJeremy L Thompson     CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
145*4b3e95d5SJeremy L Thompson   }
146*4b3e95d5SJeremy L Thompson   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
147*4b3e95d5SJeremy L Thompson 
148*4b3e95d5SJeremy L Thompson   // Set field constants
149*4b3e95d5SJeremy L Thompson   if (eval_mode != CEED_EVAL_WEIGHT) {
150*4b3e95d5SJeremy L Thompson     code << "  const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
151*4b3e95d5SJeremy L Thompson     code << "  const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
152*4b3e95d5SJeremy L Thompson   }
153*4b3e95d5SJeremy L Thompson 
154*4b3e95d5SJeremy L Thompson   // Load basis data
155*4b3e95d5SJeremy L Thompson   code << "  // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
156*4b3e95d5SJeremy L Thompson   switch (eval_mode) {
157*4b3e95d5SJeremy L Thompson     case CEED_EVAL_NONE:
158*4b3e95d5SJeremy L Thompson       break;
159*4b3e95d5SJeremy L Thompson     case CEED_EVAL_INTERP:
160*4b3e95d5SJeremy L Thompson       if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
161*4b3e95d5SJeremy L Thompson       else data->B.outputs[i] = basis_data->d_interp_1d;
162*4b3e95d5SJeremy L Thompson       code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_1d * Q_1d << "];\n";
163*4b3e95d5SJeremy L Thompson       code << "  loadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
164*4b3e95d5SJeremy L Thompson       break;
165*4b3e95d5SJeremy L Thompson     case CEED_EVAL_GRAD:
166*4b3e95d5SJeremy L Thompson       if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
167*4b3e95d5SJeremy L Thompson       else data->B.outputs[i] = basis_data->d_interp_1d;
168*4b3e95d5SJeremy L Thompson       code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_1d * Q_1d << "];\n";
169*4b3e95d5SJeremy L Thompson       code << "  loadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
170*4b3e95d5SJeremy L Thompson       if (use_3d_slices) {
171*4b3e95d5SJeremy L Thompson         if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
172*4b3e95d5SJeremy L Thompson         else data->G.outputs[i] = basis_data->d_collo_grad_1d;
173*4b3e95d5SJeremy L Thompson         code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_1d * Q_1d << "];\n";
174*4b3e95d5SJeremy L Thompson         code << "  loadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
175*4b3e95d5SJeremy L Thompson       } else {
176*4b3e95d5SJeremy L Thompson         bool has_collo_grad = basis_data->d_collo_grad_1d;
177*4b3e95d5SJeremy L Thompson 
178*4b3e95d5SJeremy L Thompson         if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
179*4b3e95d5SJeremy L Thompson         else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
180*4b3e95d5SJeremy L Thompson         if (has_collo_grad) {
181*4b3e95d5SJeremy L Thompson           code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_1d * Q_1d << "];\n";
182*4b3e95d5SJeremy L Thompson           code << "  loadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
183*4b3e95d5SJeremy L Thompson         } else {
184*4b3e95d5SJeremy L Thompson           code << "  __shared__ CeedScalar s_G" << var_suffix << "[" << Q_1d * P_1d << "];\n";
185*4b3e95d5SJeremy L Thompson           code << "  loadMatrix<" << P_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
186*4b3e95d5SJeremy L Thompson         }
187*4b3e95d5SJeremy L Thompson       }
188*4b3e95d5SJeremy L Thompson       break;
189*4b3e95d5SJeremy L Thompson     case CEED_EVAL_WEIGHT:
190*4b3e95d5SJeremy L Thompson       break;  // No action
191*4b3e95d5SJeremy L Thompson       // LCOV_EXCL_START
192*4b3e95d5SJeremy L Thompson     case CEED_EVAL_DIV:
193*4b3e95d5SJeremy L Thompson       break;  // TODO: Not implemented
194*4b3e95d5SJeremy L Thompson     case CEED_EVAL_CURL:
195*4b3e95d5SJeremy L Thompson       break;  // TODO: Not implemented
196*4b3e95d5SJeremy L Thompson               // LCOV_EXCL_STOP
197*4b3e95d5SJeremy L Thompson   }
198*4b3e95d5SJeremy L Thompson   return CEED_ERROR_SUCCESS;
199*4b3e95d5SJeremy L Thompson }
200*4b3e95d5SJeremy L Thompson 
201*4b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
202*4b3e95d5SJeremy L Thompson // Restriction
203*4b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
204*4b3e95d5SJeremy L Thompson static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim,
205*4b3e95d5SJeremy L Thompson                                                        CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input,
206*4b3e95d5SJeremy L Thompson                                                        bool use_3d_slices) {
207*4b3e95d5SJeremy L Thompson   std::string               var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
208*4b3e95d5SJeremy L Thompson   std::string               P_name     = "P_1d" + var_suffix;
209*4b3e95d5SJeremy L Thompson   CeedEvalMode              eval_mode  = CEED_EVAL_NONE;
210*4b3e95d5SJeremy L Thompson   CeedInt                   elem_size = 0, num_comp = 0, P_1d = 0;
211*4b3e95d5SJeremy L Thompson   CeedSize                  l_size;
212*4b3e95d5SJeremy L Thompson   CeedElemRestriction_Cuda *rstr_data;
213*4b3e95d5SJeremy L Thompson   CeedElemRestriction       elem_rstr;
214*4b3e95d5SJeremy L Thompson   CeedBasis                 basis;
215*4b3e95d5SJeremy L Thompson 
216*4b3e95d5SJeremy L Thompson   // Get field data
217*4b3e95d5SJeremy L Thompson   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
218*4b3e95d5SJeremy L Thompson   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
219*4b3e95d5SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
220*4b3e95d5SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
221*4b3e95d5SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
222*4b3e95d5SJeremy L Thompson   }
223*4b3e95d5SJeremy L Thompson   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
224*4b3e95d5SJeremy L Thompson   if (basis != CEED_BASIS_NONE) {
225*4b3e95d5SJeremy L Thompson     CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
226*4b3e95d5SJeremy L Thompson   }
227*4b3e95d5SJeremy L Thompson   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
228*4b3e95d5SJeremy L Thompson 
229*4b3e95d5SJeremy L Thompson   // Restriction
230*4b3e95d5SJeremy L Thompson   if (is_input) {
231*4b3e95d5SJeremy L Thompson     // Input
232*4b3e95d5SJeremy L Thompson     if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices)) {
233*4b3e95d5SJeremy L Thompson       bool is_strided;
234*4b3e95d5SJeremy L Thompson 
235*4b3e95d5SJeremy L Thompson       code << "    CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
236*4b3e95d5SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
237*4b3e95d5SJeremy L Thompson       if (!is_strided) {
238*4b3e95d5SJeremy L Thompson         CeedInt comp_stride;
239*4b3e95d5SJeremy L Thompson 
240*4b3e95d5SJeremy L Thompson         CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
241*4b3e95d5SJeremy L Thompson         code << "    const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
242*4b3e95d5SJeremy L Thompson         CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
243*4b3e95d5SJeremy L Thompson         code << "    // CompStride: " << comp_stride << "\n";
244*4b3e95d5SJeremy L Thompson         data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
245*4b3e95d5SJeremy L Thompson         code << "    readDofsOffset" << dim << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name << ">(data, l_size" << var_suffix
246*4b3e95d5SJeremy L Thompson              << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix << ");\n";
247*4b3e95d5SJeremy L Thompson       } else {
248*4b3e95d5SJeremy L Thompson         bool    has_backend_strides;
249*4b3e95d5SJeremy L Thompson         CeedInt num_elem;
250*4b3e95d5SJeremy L Thompson 
251*4b3e95d5SJeremy L Thompson         CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
252*4b3e95d5SJeremy L Thompson         CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
253*4b3e95d5SJeremy L Thompson         CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
254*4b3e95d5SJeremy L Thompson 
255*4b3e95d5SJeremy L Thompson         if (!has_backend_strides) {
256*4b3e95d5SJeremy L Thompson           CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
257*4b3e95d5SJeremy L Thompson         }
258*4b3e95d5SJeremy L Thompson         code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
259*4b3e95d5SJeremy L Thompson         code << "    readDofsStrided" << dim << "d<num_comp" << var_suffix << ", " << P_name << "," << strides[0] << "," << strides[1] << ","
260*4b3e95d5SJeremy L Thompson              << strides[2] << ">(data, elem, d" << var_suffix << ", r_e" << var_suffix << ");\n";
261*4b3e95d5SJeremy L Thompson       }
262*4b3e95d5SJeremy L Thompson     }
263*4b3e95d5SJeremy L Thompson   } else {
264*4b3e95d5SJeremy L Thompson     // Output
265*4b3e95d5SJeremy L Thompson     bool is_strided;
266*4b3e95d5SJeremy L Thompson 
267*4b3e95d5SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
268*4b3e95d5SJeremy L Thompson     if (!is_strided) {
269*4b3e95d5SJeremy L Thompson       CeedInt comp_stride;
270*4b3e95d5SJeremy L Thompson 
271*4b3e95d5SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
272*4b3e95d5SJeremy L Thompson       code << "    const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
273*4b3e95d5SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
274*4b3e95d5SJeremy L Thompson       code << "    // CompStride: " << comp_stride << "\n";
275*4b3e95d5SJeremy L Thompson       data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
276*4b3e95d5SJeremy L Thompson       code << "    writeDofsOffset" << dim << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name << ">(data, l_size" << var_suffix
277*4b3e95d5SJeremy L Thompson            << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix << ");\n";
278*4b3e95d5SJeremy L Thompson     } else {
279*4b3e95d5SJeremy L Thompson       bool    has_backend_strides;
280*4b3e95d5SJeremy L Thompson       CeedInt num_elem;
281*4b3e95d5SJeremy L Thompson 
282*4b3e95d5SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
283*4b3e95d5SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
284*4b3e95d5SJeremy L Thompson       CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
285*4b3e95d5SJeremy L Thompson 
286*4b3e95d5SJeremy L Thompson       if (!has_backend_strides) {
287*4b3e95d5SJeremy L Thompson         CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
288*4b3e95d5SJeremy L Thompson       }
289*4b3e95d5SJeremy L Thompson       code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
290*4b3e95d5SJeremy L Thompson       code << "    writeDofsStrided" << dim << "d<num_comp" << var_suffix << ", " << P_name << "," << strides[0] << "," << strides[1] << ","
291*4b3e95d5SJeremy L Thompson            << strides[2] << ">(data, elem, r_e" << var_suffix << ", d" << var_suffix << ");\n";
292*4b3e95d5SJeremy L Thompson     }
293*4b3e95d5SJeremy L Thompson   }
294*4b3e95d5SJeremy L Thompson   return CEED_ERROR_SUCCESS;
295*4b3e95d5SJeremy L Thompson }
296*4b3e95d5SJeremy L Thompson 
297*4b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
298*4b3e95d5SJeremy L Thompson // Basis
299*4b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
300*4b3e95d5SJeremy L Thompson static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim,
301*4b3e95d5SJeremy L Thompson                                                  CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input,
302*4b3e95d5SJeremy L Thompson                                                  bool use_3d_slices) {
303*4b3e95d5SJeremy L Thompson   std::string         var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
304*4b3e95d5SJeremy L Thompson   std::string         P_name = "P_1d" + var_suffix, Q_name = "Q_1d";
305*4b3e95d5SJeremy L Thompson   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
306*4b3e95d5SJeremy L Thompson   CeedInt             elem_size = 0, num_comp = 0, P_1d = 0;
307*4b3e95d5SJeremy L Thompson   CeedElemRestriction elem_rstr;
308*4b3e95d5SJeremy L Thompson   CeedBasis           basis;
309*4b3e95d5SJeremy L Thompson 
310*4b3e95d5SJeremy L Thompson   // Get field data
311*4b3e95d5SJeremy L Thompson   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
312*4b3e95d5SJeremy L Thompson   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
313*4b3e95d5SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
314*4b3e95d5SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
315*4b3e95d5SJeremy L Thompson   }
316*4b3e95d5SJeremy L Thompson   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
317*4b3e95d5SJeremy L Thompson   if (basis != CEED_BASIS_NONE) {
318*4b3e95d5SJeremy L Thompson     CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
319*4b3e95d5SJeremy L Thompson   }
320*4b3e95d5SJeremy L Thompson   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
321*4b3e95d5SJeremy L Thompson 
322*4b3e95d5SJeremy L Thompson   // Basis
323*4b3e95d5SJeremy L Thompson   code << "    // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
324*4b3e95d5SJeremy L Thompson   if (is_input) {
325*4b3e95d5SJeremy L Thompson     switch (eval_mode) {
326*4b3e95d5SJeremy L Thompson       case CEED_EVAL_NONE:
327*4b3e95d5SJeremy L Thompson         if (!use_3d_slices) {
328*4b3e95d5SJeremy L Thompson           code << "    CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n";
329*4b3e95d5SJeremy L Thompson         }
330*4b3e95d5SJeremy L Thompson         break;
331*4b3e95d5SJeremy L Thompson       case CEED_EVAL_INTERP:
332*4b3e95d5SJeremy L Thompson         code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
333*4b3e95d5SJeremy L Thompson         code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
334*4b3e95d5SJeremy L Thompson              << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
335*4b3e95d5SJeremy L Thompson         break;
336*4b3e95d5SJeremy L Thompson       case CEED_EVAL_GRAD:
337*4b3e95d5SJeremy L Thompson         if (use_3d_slices) {
338*4b3e95d5SJeremy L Thompson           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
339*4b3e95d5SJeremy L Thompson           code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name
340*4b3e95d5SJeremy L Thompson                << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
341*4b3e95d5SJeremy L Thompson         } else {
342*4b3e95d5SJeremy L Thompson           code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << Q_name << "];\n";
343*4b3e95d5SJeremy L Thompson           code << "    Grad" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d<num_comp" << var_suffix
344*4b3e95d5SJeremy L Thompson                << ", P_1d" << var_suffix << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q"
345*4b3e95d5SJeremy L Thompson                << var_suffix << ");\n";
346*4b3e95d5SJeremy L Thompson         }
347*4b3e95d5SJeremy L Thompson         break;
348*4b3e95d5SJeremy L Thompson       case CEED_EVAL_WEIGHT: {
349*4b3e95d5SJeremy L Thompson         CeedBasis_Cuda_shared *basis_data;
350*4b3e95d5SJeremy L Thompson 
351*4b3e95d5SJeremy L Thompson         code << "    CeedScalar r_q" << var_suffix << "[" << Q_name << "];\n";
352*4b3e95d5SJeremy L Thompson         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
353*4b3e95d5SJeremy L Thompson         data->W = basis_data->d_q_weight_1d;
354*4b3e95d5SJeremy L Thompson         code << "    Weight" << (dim > 1 ? "Tensor" : "") << dim << "d<" << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
355*4b3e95d5SJeremy L Thompson         break;
356*4b3e95d5SJeremy L Thompson       }
357*4b3e95d5SJeremy L Thompson       // LCOV_EXCL_START
358*4b3e95d5SJeremy L Thompson       case CEED_EVAL_DIV:
359*4b3e95d5SJeremy L Thompson         break;  // TODO: Not implemented
360*4b3e95d5SJeremy L Thompson       case CEED_EVAL_CURL:
361*4b3e95d5SJeremy L Thompson         break;  // TODO: Not implemented
362*4b3e95d5SJeremy L Thompson                 // LCOV_EXCL_STOP
363*4b3e95d5SJeremy L Thompson     }
364*4b3e95d5SJeremy L Thompson   } else {
365*4b3e95d5SJeremy L Thompson     switch (eval_mode) {
366*4b3e95d5SJeremy L Thompson       case CEED_EVAL_NONE:
367*4b3e95d5SJeremy L Thompson         code << "    CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
368*4b3e95d5SJeremy L Thompson         break;  // No action
369*4b3e95d5SJeremy L Thompson       case CEED_EVAL_INTERP:
370*4b3e95d5SJeremy L Thompson         code << "    CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
371*4b3e95d5SJeremy L Thompson         code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
372*4b3e95d5SJeremy L Thompson              << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
373*4b3e95d5SJeremy L Thompson         break;
374*4b3e95d5SJeremy L Thompson       case CEED_EVAL_GRAD:
375*4b3e95d5SJeremy L Thompson         code << "    CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
376*4b3e95d5SJeremy L Thompson         if (use_3d_slices) {
377*4b3e95d5SJeremy L Thompson           code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
378*4b3e95d5SJeremy L Thompson                << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
379*4b3e95d5SJeremy L Thompson         } else {
380*4b3e95d5SJeremy L Thompson           code << "    GradTranspose" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d<num_comp"
381*4b3e95d5SJeremy L Thompson                << var_suffix << ", " << P_name << "," << Q_name << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix
382*4b3e95d5SJeremy L Thompson                << ", r_e" << var_suffix << ");\n";
383*4b3e95d5SJeremy L Thompson         }
384*4b3e95d5SJeremy L Thompson         break;
385*4b3e95d5SJeremy L Thompson       // LCOV_EXCL_START
386*4b3e95d5SJeremy L Thompson       case CEED_EVAL_WEIGHT:
387*4b3e95d5SJeremy L Thompson         break;  // Should not occur
388*4b3e95d5SJeremy L Thompson       case CEED_EVAL_DIV:
389*4b3e95d5SJeremy L Thompson         break;  // TODO: Not implemented
390*4b3e95d5SJeremy L Thompson       case CEED_EVAL_CURL:
391*4b3e95d5SJeremy L Thompson         break;  // TODO: Not implemented
392*4b3e95d5SJeremy L Thompson                 // LCOV_EXCL_STOP
393*4b3e95d5SJeremy L Thompson     }
394*4b3e95d5SJeremy L Thompson   }
395*4b3e95d5SJeremy L Thompson   return CEED_ERROR_SUCCESS;
396*4b3e95d5SJeremy L Thompson }
397*4b3e95d5SJeremy L Thompson 
398*4b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
399*4b3e95d5SJeremy L Thompson // QFunction
400*4b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
401*4b3e95d5SJeremy L Thompson static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt dim, CeedInt num_input_fields,
402*4b3e95d5SJeremy L Thompson                                                      CeedOperatorField *op_input_fields, CeedQFunctionField *qf_input_fields,
403*4b3e95d5SJeremy L Thompson                                                      CeedInt num_output_fields, CeedOperatorField *op_output_fields,
404*4b3e95d5SJeremy L Thompson                                                      CeedQFunctionField *qf_output_fields, std::string qfunction_name, CeedInt Q_1d,
405*4b3e95d5SJeremy L Thompson                                                      bool use_3d_slices) {
406*4b3e95d5SJeremy L Thompson   std::string         Q_name    = "Q_1d";
407*4b3e95d5SJeremy L Thompson   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
408*4b3e95d5SJeremy L Thompson   CeedElemRestriction elem_rstr;
409*4b3e95d5SJeremy L Thompson 
410*4b3e95d5SJeremy L Thompson   // Setup output arays
411*4b3e95d5SJeremy L Thompson   code << "\n    // -- Output field setup\n";
412*4b3e95d5SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
413*4b3e95d5SJeremy L Thompson     std::string var_suffix = "_out_" + std::to_string(i);
414*4b3e95d5SJeremy L Thompson 
415*4b3e95d5SJeremy L Thompson     code << "    // ---- Output field " << i << "\n";
416*4b3e95d5SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
417*4b3e95d5SJeremy L Thompson     if (eval_mode == CEED_EVAL_NONE || eval_mode == CEED_EVAL_INTERP) {
418*4b3e95d5SJeremy L Thompson       code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
419*4b3e95d5SJeremy L Thompson     }
420*4b3e95d5SJeremy L Thompson     if (eval_mode == CEED_EVAL_GRAD) {
421*4b3e95d5SJeremy L Thompson       if (use_3d_slices) {
422*4b3e95d5SJeremy L Thompson         // Accumulator for gradient slices
423*4b3e95d5SJeremy L Thompson         code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
424*4b3e95d5SJeremy L Thompson         code << "    for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) {\n";
425*4b3e95d5SJeremy L Thompson         code << "      r_q" << var_suffix << "[i] = 0.0;\n";
426*4b3e95d5SJeremy L Thompson         code << "    }\n";
427*4b3e95d5SJeremy L Thompson       } else {
428*4b3e95d5SJeremy L Thompson         code << "    CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << Q_name << "];\n";
429*4b3e95d5SJeremy L Thompson       }
430*4b3e95d5SJeremy L Thompson     }
431*4b3e95d5SJeremy L Thompson   }
432*4b3e95d5SJeremy L Thompson 
433*4b3e95d5SJeremy L Thompson   // We treat quadrature points per slice in 3d to save registers
434*4b3e95d5SJeremy L Thompson   if (use_3d_slices) {
435*4b3e95d5SJeremy L Thompson     code << "\n    // Note: Using planes of 3D elements\n";
436*4b3e95d5SJeremy L Thompson     code << "#pragma unroll\n";
437*4b3e95d5SJeremy L Thompson     code << "    for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
438*4b3e95d5SJeremy L Thompson     code << "      // -- Input fields\n";
439*4b3e95d5SJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) {
440*4b3e95d5SJeremy L Thompson       std::string var_suffix = "_in_" + std::to_string(i);
441*4b3e95d5SJeremy L Thompson 
442*4b3e95d5SJeremy L Thompson       code << "      // ---- Input field " << i << "\n";
443*4b3e95d5SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
444*4b3e95d5SJeremy L Thompson       // Basis action
445*4b3e95d5SJeremy L Thompson       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
446*4b3e95d5SJeremy L Thompson       switch (eval_mode) {
447*4b3e95d5SJeremy L Thompson         case CEED_EVAL_NONE:
448*4b3e95d5SJeremy L Thompson           bool is_strided;
449*4b3e95d5SJeremy L Thompson 
450*4b3e95d5SJeremy L Thompson           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
451*4b3e95d5SJeremy L Thompson 
452*4b3e95d5SJeremy L Thompson           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
453*4b3e95d5SJeremy L Thompson           CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
454*4b3e95d5SJeremy L Thompson           if (is_strided) {
455*4b3e95d5SJeremy L Thompson             bool    has_backend_strides;
456*4b3e95d5SJeremy L Thompson             CeedInt num_elem, elem_size;
457*4b3e95d5SJeremy L Thompson 
458*4b3e95d5SJeremy L Thompson             CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
459*4b3e95d5SJeremy L Thompson             CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
460*4b3e95d5SJeremy L Thompson             CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
461*4b3e95d5SJeremy L Thompson             CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
462*4b3e95d5SJeremy L Thompson 
463*4b3e95d5SJeremy L Thompson             if (!has_backend_strides) {
464*4b3e95d5SJeremy L Thompson               CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
465*4b3e95d5SJeremy L Thompson             }
466*4b3e95d5SJeremy L Thompson             code << "      // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
467*4b3e95d5SJeremy L Thompson             code << "      readSliceQuadsStrided3d<num_comp" << var_suffix << ", " << Q_name << "," << strides[0] << "," << strides[1] << ","
468*4b3e95d5SJeremy L Thompson                  << strides[2] << ">(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n";
469*4b3e95d5SJeremy L Thompson           } else {
470*4b3e95d5SJeremy L Thompson             CeedSize                  l_size = 0;
471*4b3e95d5SJeremy L Thompson             CeedInt                   comp_stride;
472*4b3e95d5SJeremy L Thompson             CeedElemRestriction_Cuda *rstr_data;
473*4b3e95d5SJeremy L Thompson 
474*4b3e95d5SJeremy L Thompson             CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
475*4b3e95d5SJeremy L Thompson             code << "      const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
476*4b3e95d5SJeremy L Thompson             CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
477*4b3e95d5SJeremy L Thompson             code << "      // CompStride: " << comp_stride << "\n";
478*4b3e95d5SJeremy L Thompson             CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
479*4b3e95d5SJeremy L Thompson             data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
480*4b3e95d5SJeremy L Thompson             code << "      readSliceQuadsOffset3d<num_comp" << var_suffix << ", " << comp_stride << ", " << Q_name << ">(data, l_size" << var_suffix
481*4b3e95d5SJeremy L Thompson                  << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
482*4b3e95d5SJeremy L Thompson           }
483*4b3e95d5SJeremy L Thompson           break;
484*4b3e95d5SJeremy L Thompson         case CEED_EVAL_INTERP:
485*4b3e95d5SJeremy L Thompson           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
486*4b3e95d5SJeremy L Thompson           code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n";
487*4b3e95d5SJeremy L Thompson           code << "        r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n";
488*4b3e95d5SJeremy L Thompson           code << "      }\n";
489*4b3e95d5SJeremy L Thompson           break;
490*4b3e95d5SJeremy L Thompson         case CEED_EVAL_GRAD:
491*4b3e95d5SJeremy L Thompson           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
492*4b3e95d5SJeremy L Thompson           code << "      gradCollo3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_q" << var_suffix << ", s_G" << var_suffix << ", r_s"
493*4b3e95d5SJeremy L Thompson                << var_suffix << ");\n";
494*4b3e95d5SJeremy L Thompson           break;
495*4b3e95d5SJeremy L Thompson         case CEED_EVAL_WEIGHT:
496*4b3e95d5SJeremy L Thompson           code << "      CeedScalar r_s" << var_suffix << "[1];\n";
497*4b3e95d5SJeremy L Thompson           code << "      r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n";
498*4b3e95d5SJeremy L Thompson           break;  // No action
499*4b3e95d5SJeremy L Thompson                   // LCOV_EXCL_START
500*4b3e95d5SJeremy L Thompson         case CEED_EVAL_DIV:
501*4b3e95d5SJeremy L Thompson           break;  // TODO: Not implemented
502*4b3e95d5SJeremy L Thompson         case CEED_EVAL_CURL:
503*4b3e95d5SJeremy L Thompson           break;  // TODO: Not implemented
504*4b3e95d5SJeremy L Thompson                   // LCOV_EXCL_STOP
505*4b3e95d5SJeremy L Thompson       }
506*4b3e95d5SJeremy L Thompson     }
507*4b3e95d5SJeremy L Thompson     code << "\n      // -- Output fields\n";
508*4b3e95d5SJeremy L Thompson     for (CeedInt i = 0; i < num_output_fields; i++) {
509*4b3e95d5SJeremy L Thompson       std::string var_suffix = "_out_" + std::to_string(i);
510*4b3e95d5SJeremy L Thompson 
511*4b3e95d5SJeremy L Thompson       code << "      // ---- Output field " << i << "\n";
512*4b3e95d5SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
513*4b3e95d5SJeremy L Thompson       // Basis action
514*4b3e95d5SJeremy L Thompson       switch (eval_mode) {
515*4b3e95d5SJeremy L Thompson         case CEED_EVAL_NONE:
516*4b3e95d5SJeremy L Thompson           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
517*4b3e95d5SJeremy L Thompson           break;  // No action
518*4b3e95d5SJeremy L Thompson         case CEED_EVAL_INTERP:
519*4b3e95d5SJeremy L Thompson           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
520*4b3e95d5SJeremy L Thompson           break;
521*4b3e95d5SJeremy L Thompson         case CEED_EVAL_GRAD:
522*4b3e95d5SJeremy L Thompson           code << "      CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n";
523*4b3e95d5SJeremy L Thompson           break;
524*4b3e95d5SJeremy L Thompson           // LCOV_EXCL_START
525*4b3e95d5SJeremy L Thompson         case CEED_EVAL_WEIGHT:
526*4b3e95d5SJeremy L Thompson           break;  // Should not occur
527*4b3e95d5SJeremy L Thompson         case CEED_EVAL_DIV:
528*4b3e95d5SJeremy L Thompson           break;  // TODO: Not implemented
529*4b3e95d5SJeremy L Thompson         case CEED_EVAL_CURL:
530*4b3e95d5SJeremy L Thompson           break;  // TODO: Not implemented
531*4b3e95d5SJeremy L Thompson                   // LCOV_EXCL_STOP
532*4b3e95d5SJeremy L Thompson       }
533*4b3e95d5SJeremy L Thompson     }
534*4b3e95d5SJeremy L Thompson   } else {
535*4b3e95d5SJeremy L Thompson     code << "\n    // Note: Using full elements\n";
536*4b3e95d5SJeremy L Thompson     code << "    {\n";
537*4b3e95d5SJeremy L Thompson     code << "      // -- Input fields\n";
538*4b3e95d5SJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) {
539*4b3e95d5SJeremy L Thompson       code << "      // ---- Input field " << i << "\n";
540*4b3e95d5SJeremy L Thompson       code << "      CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n";
541*4b3e95d5SJeremy L Thompson     }
542*4b3e95d5SJeremy L Thompson     code << "      // -- Output fields\n";
543*4b3e95d5SJeremy L Thompson     for (CeedInt i = 0; i < num_output_fields; i++) {
544*4b3e95d5SJeremy L Thompson       code << "      // ---- Output field " << i << "\n";
545*4b3e95d5SJeremy L Thompson       code << "      CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n";
546*4b3e95d5SJeremy L Thompson     }
547*4b3e95d5SJeremy L Thompson   }
548*4b3e95d5SJeremy L Thompson 
549*4b3e95d5SJeremy L Thompson   // Input and output buffers
550*4b3e95d5SJeremy L Thompson   code << "\n      // -- QFunction inputs and outputs\n";
551*4b3e95d5SJeremy L Thompson   code << "      // ---- Inputs\n";
552*4b3e95d5SJeremy L Thompson   code << "      CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
553*4b3e95d5SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
554*4b3e95d5SJeremy L Thompson     code << "      // ------ Input field " << i << "\n";
555*4b3e95d5SJeremy L Thompson     code << "      inputs[" << i << "] = r_s_in_" << i << ";\n";
556*4b3e95d5SJeremy L Thompson   }
557*4b3e95d5SJeremy L Thompson   code << "      // ---- Outputs\n";
558*4b3e95d5SJeremy L Thompson   code << "      CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
559*4b3e95d5SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
560*4b3e95d5SJeremy L Thompson     code << "      // ------ Output field " << i << "\n";
561*4b3e95d5SJeremy L Thompson     code << "      outputs[" << i << "] = r_s_out_" << i << ";\n";
562*4b3e95d5SJeremy L Thompson   }
563*4b3e95d5SJeremy L Thompson 
564*4b3e95d5SJeremy L Thompson   // Apply QFunction
565*4b3e95d5SJeremy L Thompson   code << "\n      // -- Apply QFunction\n";
566*4b3e95d5SJeremy L Thompson   code << "      " << qfunction_name << "(ctx, ";
567*4b3e95d5SJeremy L Thompson   if (dim != 3 || use_3d_slices) {
568*4b3e95d5SJeremy L Thompson     code << "1";
569*4b3e95d5SJeremy L Thompson   } else {
570*4b3e95d5SJeremy L Thompson     code << "Q_1d";
571*4b3e95d5SJeremy L Thompson   }
572*4b3e95d5SJeremy L Thompson   code << ", inputs, outputs);\n";
573*4b3e95d5SJeremy L Thompson 
574*4b3e95d5SJeremy L Thompson   // Copy or apply transpose grad, if needed
575*4b3e95d5SJeremy L Thompson   if (use_3d_slices) {
576*4b3e95d5SJeremy L Thompson     code << "      // -- Output fields\n";
577*4b3e95d5SJeremy L Thompson     for (CeedInt i = 0; i < num_output_fields; i++) {
578*4b3e95d5SJeremy L Thompson       std::string var_suffix = "_out_" + std::to_string(i);
579*4b3e95d5SJeremy L Thompson       std::string P_name     = "P_1d" + var_suffix;
580*4b3e95d5SJeremy L Thompson 
581*4b3e95d5SJeremy L Thompson       code << "      // ---- Output field " << i << "\n";
582*4b3e95d5SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
583*4b3e95d5SJeremy L Thompson       // Basis action
584*4b3e95d5SJeremy L Thompson       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
585*4b3e95d5SJeremy L Thompson       switch (eval_mode) {
586*4b3e95d5SJeremy L Thompson         case CEED_EVAL_NONE:
587*4b3e95d5SJeremy L Thompson           code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
588*4b3e95d5SJeremy L Thompson           code << "        r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
589*4b3e95d5SJeremy L Thompson           code << "      }\n";
590*4b3e95d5SJeremy L Thompson           break;  // No action
591*4b3e95d5SJeremy L Thompson         case CEED_EVAL_INTERP:
592*4b3e95d5SJeremy L Thompson           code << "      for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
593*4b3e95d5SJeremy L Thompson           code << "        r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
594*4b3e95d5SJeremy L Thompson           code << "      }\n";
595*4b3e95d5SJeremy L Thompson           break;
596*4b3e95d5SJeremy L Thompson         case CEED_EVAL_GRAD:
597*4b3e95d5SJeremy L Thompson           code << "      gradColloTranspose3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_s" << var_suffix << ", s_G" << var_suffix
598*4b3e95d5SJeremy L Thompson                << ", r_q" << var_suffix << ");\n";
599*4b3e95d5SJeremy L Thompson           break;
600*4b3e95d5SJeremy L Thompson           // LCOV_EXCL_START
601*4b3e95d5SJeremy L Thompson         case CEED_EVAL_WEIGHT:
602*4b3e95d5SJeremy L Thompson           break;  // Should not occur
603*4b3e95d5SJeremy L Thompson         case CEED_EVAL_DIV:
604*4b3e95d5SJeremy L Thompson           break;  // TODO: Not implemented
605*4b3e95d5SJeremy L Thompson         case CEED_EVAL_CURL:
606*4b3e95d5SJeremy L Thompson           break;  // TODO: Not implemented
607*4b3e95d5SJeremy L Thompson                   // LCOV_EXCL_STOP
608*4b3e95d5SJeremy L Thompson       }
609*4b3e95d5SJeremy L Thompson     }
610*4b3e95d5SJeremy L Thompson   }
611*4b3e95d5SJeremy L Thompson   code << "    }\n";
612*4b3e95d5SJeremy L Thompson   return CEED_ERROR_SUCCESS;
613*4b3e95d5SJeremy L Thompson }
614*4b3e95d5SJeremy L Thompson 
615*4b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
616b2165e7aSSebastian Grimberg // Build single operator kernel
617ab213215SJeremy L Thompson //------------------------------------------------------------------------------
618eb7e6cafSJeremy L Thompson extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
619*4b3e95d5SJeremy L Thompson   bool                    is_tensor = true, use_3d_slices = false;
620241a4b83SYohann   Ceed                    ceed;
621*4b3e95d5SJeremy L Thompson   CeedInt                 Q_1d, num_input_fields, num_output_fields, dim = 1;
622ca735530SJeremy L Thompson   CeedQFunctionField     *qf_input_fields, *qf_output_fields;
623ca735530SJeremy L Thompson   CeedQFunction_Cuda_gen *qf_data;
624ca735530SJeremy L Thompson   CeedQFunction           qf;
625ca735530SJeremy L Thompson   CeedOperatorField      *op_input_fields, *op_output_fields;
626ca735530SJeremy L Thompson   CeedOperator_Cuda_gen  *data;
627*4b3e95d5SJeremy L Thompson   std::ostringstream      code;
628*4b3e95d5SJeremy L Thompson 
629*4b3e95d5SJeremy L Thompson   {
630*4b3e95d5SJeremy L Thompson     bool is_setup_done;
631ca735530SJeremy L Thompson 
632ca735530SJeremy L Thompson     CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
633ca735530SJeremy L Thompson     if (is_setup_done) return CEED_ERROR_SUCCESS;
634*4b3e95d5SJeremy L Thompson   }
635ca735530SJeremy L Thompson 
636ca735530SJeremy L Thompson   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
637ca735530SJeremy L Thompson   CeedCallBackend(CeedOperatorGetData(op, &data));
638ca735530SJeremy L Thompson   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
639ca735530SJeremy L Thompson   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
640ca735530SJeremy L Thompson   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
641ca735530SJeremy L Thompson   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
642241a4b83SYohann 
643*4b3e95d5SJeremy L Thompson   // Get operator data
644*4b3e95d5SJeremy L Thompson   CeedCallBackend(CeedOperatorBuildKernelData_Cuda_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields,
645*4b3e95d5SJeremy L Thompson                                                        qf_output_fields, &data->max_P_1d, &Q_1d, &dim, &is_tensor, &use_3d_slices));
646*4b3e95d5SJeremy L Thompson   if (dim == 0) dim = 1;
647*4b3e95d5SJeremy L Thompson   data->dim = dim;
648*4b3e95d5SJeremy L Thompson   if (Q_1d == 0) {
649*4b3e95d5SJeremy L Thompson     CeedInt Q;
650*4b3e95d5SJeremy L Thompson 
651*4b3e95d5SJeremy L Thompson     CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
652*4b3e95d5SJeremy L Thompson     Q_1d = Q;
653*4b3e95d5SJeremy L Thompson   }
654*4b3e95d5SJeremy L Thompson   data->Q_1d = Q_1d;
655*4b3e95d5SJeremy L Thompson 
6560b454692Sjeremylt   // Check for restriction only identity operator
657*4b3e95d5SJeremy L Thompson   {
658*4b3e95d5SJeremy L Thompson     bool is_identity_qf;
659*4b3e95d5SJeremy L Thompson 
6602b730f8bSJeremy L Thompson     CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf));
6610b454692Sjeremylt     if (is_identity_qf) {
6629e201c85SYohann       CeedEvalMode eval_mode_in, eval_mode_out;
663ca735530SJeremy L Thompson 
6642b730f8bSJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in));
6652b730f8bSJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out));
6666574a04fSJeremy L Thompson       CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND,
6676574a04fSJeremy L Thompson                 "Backend does not implement restriction only identity operators");
6680b454692Sjeremylt     }
669*4b3e95d5SJeremy L Thompson   }
6700b454692Sjeremylt 
671f1a13f77SYohann Dudouit   // Add atomicAdd function for old NVidia architectures
672*4b3e95d5SJeremy L Thompson   {
673*4b3e95d5SJeremy L Thompson     Ceed_Cuda            *ceed_data;
674*4b3e95d5SJeremy L Thompson     struct cudaDeviceProp prop;
675*4b3e95d5SJeremy L Thompson 
6762b730f8bSJeremy L Thompson     CeedCallBackend(CeedGetData(ceed, &ceed_data));
6772b730f8bSJeremy L Thompson     CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id));
67880a9ef05SNatalie Beams     if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) {
67922070f95SJeremy L Thompson       char       *atomic_add_source;
68022070f95SJeremy L Thompson       const char *atomic_add_path;
681ca735530SJeremy L Thompson 
6822b730f8bSJeremy L Thompson       CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-atomic-add-fallback.h", &atomic_add_path));
68323d4529eSJeremy L Thompson       CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Atomic Add Source -----\n");
6842b730f8bSJeremy L Thompson       CeedCallBackend(CeedLoadSourceToBuffer(ceed, atomic_add_path, &atomic_add_source));
6859e201c85SYohann       code << atomic_add_source;
6862b730f8bSJeremy L Thompson       CeedCallBackend(CeedFree(&atomic_add_path));
6872b730f8bSJeremy L Thompson       CeedCallBackend(CeedFree(&atomic_add_source));
688f1a13f77SYohann Dudouit     }
689*4b3e95d5SJeremy L Thompson   }
690f1a13f77SYohann Dudouit 
6919e201c85SYohann   // Load basis source files
692*4b3e95d5SJeremy L Thompson   // TODO: Add non-tensor, AtPoints
6939e201c85SYohann   {
69422070f95SJeremy L Thompson     char       *tensor_basis_kernel_source;
69522070f95SJeremy L Thompson     const char *tensor_basis_kernel_path;
696ca735530SJeremy L Thompson 
6972b730f8bSJeremy L Thompson     CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h", &tensor_basis_kernel_path));
69823d4529eSJeremy L Thompson     CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Tensor Basis Kernel Source -----\n");
6992b730f8bSJeremy L Thompson     CeedCallBackend(CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_kernel_source));
7009e201c85SYohann     code << tensor_basis_kernel_source;
7012b730f8bSJeremy L Thompson     CeedCallBackend(CeedFree(&tensor_basis_kernel_path));
7022b730f8bSJeremy L Thompson     CeedCallBackend(CeedFree(&tensor_basis_kernel_source));
7039e201c85SYohann   }
7049e201c85SYohann   {
70522070f95SJeremy L Thompson     char       *cuda_gen_template_source;
70622070f95SJeremy L Thompson     const char *cuda_gen_template_path;
707ca735530SJeremy L Thompson 
7082b730f8bSJeremy L Thompson     CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/cuda/cuda-gen-templates.h", &cuda_gen_template_path));
70923d4529eSJeremy L Thompson     CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "----- Loading Cuda-Gen Template Source -----\n");
7102b730f8bSJeremy L Thompson     CeedCallBackend(CeedLoadSourceToBuffer(ceed, cuda_gen_template_path, &cuda_gen_template_source));
7119e201c85SYohann     code << cuda_gen_template_source;
7122b730f8bSJeremy L Thompson     CeedCallBackend(CeedFree(&cuda_gen_template_path));
7132b730f8bSJeremy L Thompson     CeedCallBackend(CeedFree(&cuda_gen_template_source));
7149e201c85SYohann   }
715241a4b83SYohann 
716*4b3e95d5SJeremy L Thompson   // Get QFunction name
717*4b3e95d5SJeremy L Thompson   std::string qfunction_name(qf_data->qfunction_name);
718*4b3e95d5SJeremy L Thompson   std::string operator_name;
719*4b3e95d5SJeremy L Thompson 
72009095acaSJeremy L Thompson   operator_name = "CeedKernelCudaGenOperator_" + qfunction_name;
7214d537eeaSYohann 
7229e201c85SYohann   // Define CEED_Q_VLA
7239e201c85SYohann   code << "\n#undef CEED_Q_VLA\n";
724*4b3e95d5SJeremy L Thompson   if (dim != 3 || use_3d_slices) {
7259e201c85SYohann     code << "#define CEED_Q_VLA 1\n\n";
7269e201c85SYohann   } else {
7279e201c85SYohann     code << "#define CEED_Q_VLA " << Q_1d << "\n\n";
7289e201c85SYohann   }
7299e201c85SYohann 
730*4b3e95d5SJeremy L Thompson   // Add user QFunction source
731*4b3e95d5SJeremy L Thompson   {
732*4b3e95d5SJeremy L Thompson     std::string qfunction_source(qf_data->qfunction_source);
733*4b3e95d5SJeremy L Thompson 
73409095acaSJeremy L Thompson     code << qfunction_source;
735*4b3e95d5SJeremy L Thompson   }
736241a4b83SYohann 
737241a4b83SYohann   // Setup
738818e0025SJeremy L Thompson   code << "\n// -----------------------------------------------------------------------------\n";
739*4b3e95d5SJeremy L Thompson   code << "// Operator Kernel\n";
740*4b3e95d5SJeremy L Thompson   code << "// \n";
741*4b3e95d5SJeremy L Thompson   code << "// d_[in,out]_i:   CeedVector device array\n";
742*4b3e95d5SJeremy L Thompson   code << "// r_[in,out]_e_i: Element vector register\n";
743*4b3e95d5SJeremy L Thompson   code << "// r_[in,out]_q_i: Quadrature space vector register\n";
744*4b3e95d5SJeremy L Thompson   code << "// r_[in,out]_s_i: Quadrature space slice  vector register\n";
745*4b3e95d5SJeremy L Thompson   code << "// \n";
746*4b3e95d5SJeremy L Thompson   code << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
747*4b3e95d5SJeremy L Thompson   code << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
748*4b3e95d5SJeremy L Thompson   code << "// -----------------------------------------------------------------------------\n";
749*4b3e95d5SJeremy L Thompson   code << "extern \"C\" __global__ void " << operator_name
7502b730f8bSJeremy L Thompson        << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W) {\n";
751*4b3e95d5SJeremy L Thompson 
752*4b3e95d5SJeremy L Thompson   // Scratch buffers
7539e201c85SYohann   for (CeedInt i = 0; i < num_input_fields; i++) {
754*4b3e95d5SJeremy L Thompson     CeedEvalMode eval_mode;
755*4b3e95d5SJeremy L Thompson 
7562b730f8bSJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
7579e201c85SYohann     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
758*4b3e95d5SJeremy L Thompson       code << "  const CeedScalar *d_in_" << i << " = fields.inputs[" << i << "];\n";
759241a4b83SYohann     }
760241a4b83SYohann   }
7619e201c85SYohann   for (CeedInt i = 0; i < num_output_fields; i++) {
762*4b3e95d5SJeremy L Thompson     code << "  CeedScalar *d_out_" << i << " = fields.outputs[" << i << "];\n";
763241a4b83SYohann   }
7641da99368SJeremy L Thompson 
7659e201c85SYohann   code << "  const CeedInt dim = " << dim << ";\n";
7669e201c85SYohann   code << "  const CeedInt Q_1d = " << Q_1d << ";\n";
7671da99368SJeremy L Thompson 
768*4b3e95d5SJeremy L Thompson   // Shared data
769241a4b83SYohann   code << "  extern __shared__ CeedScalar slice[];\n";
7709e201c85SYohann   code << "  SharedData_Cuda data;\n";
7719e201c85SYohann   code << "  data.t_id_x = threadIdx.x;\n";
7729e201c85SYohann   code << "  data.t_id_y = threadIdx.y;\n";
7739e201c85SYohann   code << "  data.t_id_z = threadIdx.z;\n";
7749e201c85SYohann   code << "  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
7759e201c85SYohann   code << "  data.slice = slice + data.t_id_z*T_1D" << (dim > 1 ? "*T_1D" : "") << ";\n";
776920dcdc4Sjeremylt 
777ac421f39SYohann   // Initialize constants, and matrices B and G
778*4b3e95d5SJeremy L Thompson   code << "\n  // Input field constants and basis data\n";
7799e201c85SYohann   for (CeedInt i = 0; i < num_input_fields; i++) {
780*4b3e95d5SJeremy L Thompson     CeedCall(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices));
781920dcdc4Sjeremylt   }
782*4b3e95d5SJeremy L Thompson   code << "\n  // Output field constants and basis data\n";
7839e201c85SYohann   for (CeedInt i = 0; i < num_output_fields; i++) {
784*4b3e95d5SJeremy L Thompson     CeedCall(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
785*4b3e95d5SJeremy L Thompson   }
786920dcdc4Sjeremylt 
787*4b3e95d5SJeremy L Thompson   // Loop over all elements
788*4b3e95d5SJeremy L Thompson   code << "\n  // Element loop\n";
789ac421f39SYohann   code << "  __syncthreads();\n";
7909e201c85SYohann   code << "  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
791*4b3e95d5SJeremy L Thompson 
792*4b3e95d5SJeremy L Thompson   // -- Input restriction and basis
793*4b3e95d5SJeremy L Thompson   code << "    // -- Input field restrictions and basis actions\n";
7949e201c85SYohann   for (CeedInt i = 0; i < num_input_fields; i++) {
795*4b3e95d5SJeremy L Thompson     code << "    // ---- Input field " << i << "\n";
796920dcdc4Sjeremylt 
797*4b3e95d5SJeremy L Thompson     // ---- Restriction
798*4b3e95d5SJeremy L Thompson     CeedCallBackend(
799*4b3e95d5SJeremy L Thompson         CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, dim, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices));
8009a2291e3SJeremy L Thompson 
801*4b3e95d5SJeremy L Thompson     // ---- Basis action
802*4b3e95d5SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, dim, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices));
803920dcdc4Sjeremylt   }
804920dcdc4Sjeremylt 
805*4b3e95d5SJeremy L Thompson   // -- Q function
806*4b3e95d5SJeremy L Thompson   CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, dim, num_input_fields, op_input_fields, qf_input_fields, num_output_fields,
807*4b3e95d5SJeremy L Thompson                                                             op_output_fields, qf_output_fields, qfunction_name, Q_1d, use_3d_slices));
808ca735530SJeremy L Thompson 
809*4b3e95d5SJeremy L Thompson   // -- Output basis and restriction
810*4b3e95d5SJeremy L Thompson   code << "\n    // -- Output field basis action and restrictions\n";
8119e201c85SYohann   for (CeedInt i = 0; i < num_output_fields; i++) {
812*4b3e95d5SJeremy L Thompson     code << "    // ---- Output field " << i << "\n";
813ca735530SJeremy L Thompson 
814*4b3e95d5SJeremy L Thompson     // ---- Basis action
815*4b3e95d5SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
8169a2291e3SJeremy L Thompson 
817*4b3e95d5SJeremy L Thompson     // ---- Restriction
818*4b3e95d5SJeremy L Thompson     CeedCallBackend(
819*4b3e95d5SJeremy L Thompson         CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
820ac421f39SYohann   }
821241a4b83SYohann 
822*4b3e95d5SJeremy L Thompson   // Close loop and function
823241a4b83SYohann   code << "  }\n";
824818e0025SJeremy L Thompson   code << "}\n";
825818e0025SJeremy L Thompson   code << "// -----------------------------------------------------------------------------\n\n";
826241a4b83SYohann 
827ab213215SJeremy L Thompson   // View kernel for debugging
82823d4529eSJeremy L Thompson   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "Generated Operator Kernels:\n");
8293f21f6b1SJeremy L Thompson   CeedDebug(ceed, code.str().c_str());
830241a4b83SYohann 
831eb7e6cafSJeremy L Thompson   CeedCallBackend(CeedCompile_Cuda(ceed, code.str().c_str(), &data->module, 1, "T_1D", CeedIntMax(Q_1d, data->max_P_1d)));
832eb7e6cafSJeremy L Thompson   CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op));
833241a4b83SYohann 
8342b730f8bSJeremy L Thompson   CeedCallBackend(CeedOperatorSetSetupDone(op));
835e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
836241a4b83SYohann }
8372a86cc9dSSebastian Grimberg 
838ab213215SJeremy L Thompson //------------------------------------------------------------------------------
839