xref: /libCEED/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp (revision e13f2367fe6909fff484f5bc0bdefbfcf8ffe552)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #define CEED_DEBUG_COLOR 12
9 
10 #include <ceed/backend.h>
11 #include <ceed/ceed.h>
12 #include <ceed/jit-source/sycl/sycl-types.h>
13 #include <ceed/jit-tools.h>
14 
15 #include <iostream>
16 #include <sstream>
17 #include <string>
18 #include <string_view>
19 #include <vector>
20 
21 #include "../sycl-ref/ceed-sycl-ref.hpp"
22 #include "../sycl-shared/ceed-sycl-shared.hpp"
23 #include "../sycl/ceed-sycl-compile.hpp"
24 
25 #include "ceed-sycl-gen.hpp"
26 
27 //------------------------------------------------------------------------------
28 // Calculate the block size used for launching the operator kernel
29 //------------------------------------------------------------------------------
30 extern "C" int BlockGridCalculate_Sycl_gen(const CeedInt dim, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes) {
31   const CeedInt thread1d = CeedIntMax(Q_1d, P_1d);
32 
33   if (dim == 1) {
34     CeedInt elems_per_block = 64 * thread1d > 256 ? 256 / thread1d : 64;
35 
36     elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
37     block_sizes[0]  = thread1d;
38     block_sizes[1]  = 1;
39     block_sizes[2]  = elems_per_block;
40   } else if (dim == 2) {
41     const CeedInt elems_per_block = thread1d < 4 ? 16 : 2;
42 
43     block_sizes[0] = thread1d;
44     block_sizes[1] = thread1d;
45     block_sizes[2] = elems_per_block;
46   } else if (dim == 3) {
47     const CeedInt elems_per_block = thread1d < 6 ? 4 : (thread1d < 8 ? 2 : 1);
48 
49     block_sizes[0] = thread1d;
50     block_sizes[1] = thread1d;
51     block_sizes[2] = elems_per_block;
52   }
53   return CEED_ERROR_SUCCESS;
54 }
55 
56 //------------------------------------------------------------------------------
57 // Build single operator kernel
58 // - [ ] Check arguments to device functions reudsed from sycl-shared-basis are correct
59 // - [ ] Do kernel jitting!
60 //------------------------------------------------------------------------------
61 extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
62   Ceed                      ceed;
63   Ceed_Sycl                *sycl_data;
64   bool                      is_setup_done, is_identity_qf;
65   CeedSize                  l_size;
66   CeedInt                   Q, P_1d = 0, Q_1d = 0, elem_size, num_input_fields, num_output_fields, num_comp, dim = 1;
67   Fields_Sycl               h_B, h_G;
68   FieldsInt_Sycl            h_indices;
69   CeedEvalMode              eval_mode;
70   CeedElemRestriction       elem_rstr;
71   CeedElemRestriction_Sycl *rstr_impl;
72   CeedBasis                 basis;
73   CeedBasis_Sycl_shared    *basis_impl;
74   CeedQFunctionField       *qf_input_fields, *qf_output_fields;
75   CeedQFunction_Sycl_gen   *qf_impl;
76   CeedQFunction             qf;
77   CeedOperatorField        *op_input_fields, *op_output_fields;
78   CeedOperator_Sycl_gen    *impl;
79 
80   CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
81   if (is_setup_done) return CEED_ERROR_SUCCESS;
82 
83   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
84   CeedCallBackend(CeedGetData(ceed, &sycl_data));
85 
86   CeedCallBackend(CeedOperatorGetData(op, &impl));
87   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
88   CeedCallBackend(CeedQFunctionGetData(qf, &qf_impl));
89   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
90   Q_1d = Q;
91 
92   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
93   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
94 
95   // Check for restriction only identity operator
96   CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf));
97   if (is_identity_qf) {
98     CeedEvalMode eval_mode_in, eval_mode_out;
99 
100     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in));
101     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out));
102     CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND,
103               "Backend does not implement restriction only identity operators");
104   }
105 
106   std::ostringstream code;
107   // TODO: generalize to accept different device functions?
108   {
109     char       *tensor_basis_code;
110     const char *tensor_basis_kernel_path;
111 
112     CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h", &tensor_basis_kernel_path));
113     CeedDebug256(ceed, 2, "----- Loading Tensor Basis Kernel Source -----\n");
114     CeedCallBackend(CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_code));
115     code << tensor_basis_code;
116     CeedCallBackend(CeedFree(&tensor_basis_kernel_path));
117     CeedCallBackend(CeedFree(&tensor_basis_code));
118   }
119   {
120     char       *sycl_gen_template_source;
121     const char *sycl_gen_template_path;
122 
123     CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/sycl/sycl-gen-templates.h", &sycl_gen_template_path));
124     CeedDebug256(ceed, 2, "----- Loading Sycl-Gen Template Source -----\n");
125     CeedCallBackend(CeedLoadSourceToBuffer(ceed, sycl_gen_template_path, &sycl_gen_template_source));
126     code << sycl_gen_template_source;
127     CeedCallBackend(CeedFree(&sycl_gen_template_path));
128     CeedCallBackend(CeedFree(&sycl_gen_template_source));
129   }
130 
131   std::string_view  qfunction_source(qf_impl->qfunction_source);
132   std::string_view  qfunction_name(qf_impl->qfunction_name);
133   const std::string operator_name = "CeedKernelSyclGenOperator_" + std::string(qfunction_name);
134 
135   // Find dim, P_1d, Q_1d
136   impl->max_P_1d = 0;
137   for (CeedInt i = 0; i < num_input_fields; i++) {
138     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
139     if (basis != CEED_BASIS_NONE) {
140       bool is_tensor;
141 
142       CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
143       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
144 
145       // Collect dim, P_1d, and Q_1d
146       CeedCallBackend(CeedBasisGetDimension(basis, &dim));
147       CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
148       if (is_tensor) {
149         CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
150         CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
151         if (P_1d > impl->max_P_1d) impl->max_P_1d = P_1d;
152       } else {
153         // LCOV_EXCL_START
154         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
155         // LCOV_EXCL_STOP
156       }
157     }
158   }
159   // Check output bases for Q_1d, dim as well
160   //   The only input basis might be CEED_BASIS_NONE
161   for (CeedInt i = 0; i < num_output_fields; i++) {
162     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
163 
164     if (basis != CEED_BASIS_NONE) {
165       bool is_tensor;
166 
167       CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
168       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
169 
170       // Collect Q_1d
171       CeedCallBackend(CeedBasisGetDimension(basis, &dim));
172       CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
173       if (is_tensor) {
174         CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
175       } else {
176         // LCOV_EXCL_START
177         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
178         // LCOV_EXCL_STOP
179       }
180     }
181   }
182   impl->dim  = dim;
183   impl->Q_1d = Q_1d;
184 
185   // Only use 3D collocated gradient parallelization strategy when gradient is computed
186   // TODO: put in a function?
187   bool use_collograd_parallelization = false;
188 
189   if (dim == 3) {
190     bool was_grad_found = false;
191 
192     for (CeedInt i = 0; i < num_input_fields; i++) {
193       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
194       if (eval_mode == CEED_EVAL_GRAD) {
195         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
196         CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
197         use_collograd_parallelization = basis_impl->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true);
198         was_grad_found                = true;
199       }
200     }
201     for (CeedInt i = 0; i < num_output_fields; i++) {
202       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
203       if (eval_mode == CEED_EVAL_GRAD) {
204         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
205         CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
206         use_collograd_parallelization = basis_impl->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true);
207         was_grad_found                = true;
208       }
209     }
210   }
211 
212   CeedInt block_sizes[3];
213   CeedCallBackend(BlockGridCalculate_Sycl_gen(dim, P_1d, Q_1d, block_sizes));
214 
215   // Define CEED_Q_VLA
216   code << "\n#undef CEED_Q_VLA\n";
217   if (dim != 3 || use_collograd_parallelization) {
218     code << "#define CEED_Q_VLA 1\n\n";
219   } else {
220     code << "#define CEED_Q_VLA " << Q_1d << "\n\n";
221   }
222 
223   // Determine subgroup size based on supported sizes : Default : 16 (if supported)
224   std::vector allowed_sg_sizes  = sycl_data->sycl_device.get_info<sycl::info::device::sub_group_sizes>();
225   CeedInt     sub_group_size_op = allowed_sg_sizes[allowed_sg_sizes.size() - 1];
226   for (const auto &s : allowed_sg_sizes) {
227     if (s == 16) {
228       sub_group_size_op = s;
229       break;
230     }
231   }
232 
233   code << qfunction_source;
234 
235   // Kernel function
236   code << "\n// -----------------------------------------------------------------------------\n";
237   code << "__attribute__((reqd_work_group_size(GROUP_SIZE_X, GROUP_SIZE_Y, GROUP_SIZE_Z), intel_reqd_sub_group_size(" << sub_group_size_op << ")))\n";
238   code << "kernel void " << operator_name << "(";
239   code << "const CeedInt num_elem, ";
240   code << "global void* ctx, ";
241   code << "global const FieldsInt_Sycl* indices, ";
242   code << "global Fields_Sycl* fields, ";
243   code << "global const Fields_Sycl* B, ";
244   code << "global const Fields_Sycl* G, ";
245   code << "global const CeedScalar * restrict W";
246   code << ") {\n";
247 
248   for (CeedInt i = 0; i < num_input_fields; i++) {
249     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
250     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
251       code << "  global const CeedScalar* d_u_" << i << " = fields->inputs[" << i << "];\n";
252     }
253   }
254 
255   for (CeedInt i = 0; i < num_output_fields; i++) {
256     code << "  global CeedScalar* d_v_" << i << " = fields->outputs[" << i << "];\n";
257   }
258 
259   // TODO: Convert these to defined constants to save on GRF
260   code << "  const CeedInt DIM = " << dim << ";\n";
261   code << "  const CeedInt Q_1D = " << Q_1d << ";\n";
262 
263   const CeedInt scratch_size = block_sizes[0] * block_sizes[1] * block_sizes[2];
264   code << "  local CeedScalar scratch[" << scratch_size << "];\n";
265   code << "  local CeedScalar * elem_scratch = scratch + get_local_id(2) * T_1D" << (dim > 1 ? "*T_1D" : "") << ";\n";
266 
267   code << "\n  // -- Input field constants and basis data --\n";
268   // Initialize constants, and matrices B and G
269   for (CeedInt i = 0; i < num_input_fields; i++) {
270     code << "  // ---- Input field " << i << " ----\n";
271     // Get elem_size, eval_mode, num_comp
272     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
273     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
274     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
275     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
276 
277     // Set field constants
278     if (eval_mode != CEED_EVAL_WEIGHT) {
279       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
280       if (basis != CEED_BASIS_NONE) {
281         CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
282         code << "  const CeedInt P_in_" << i << " = " << P_1d << ";\n";
283       } else {
284         code << "  const CeedInt P_in_" << i << " = " << Q_1d << ";\n";
285       }
286       code << "  const CeedInt num_comp_in_" << i << " = " << num_comp << ";\n";
287     }
288 
289     // Load basis data
290     code << "  // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
291     switch (eval_mode) {
292       case CEED_EVAL_NONE:
293         break;
294       case CEED_EVAL_INTERP:
295         CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
296         h_B.inputs[i] = basis_impl->d_interp_1d;
297         code << "  local CeedScalar s_B_in_" << i << "[" << P_1d * Q_1d << "];\n";
298         code << "  loadMatrix(P_in_" << i << "*Q_1D, B->inputs[" << i << "], s_B_in_" << i << ");\n";
299         break;
300       case CEED_EVAL_GRAD:
301         CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
302         h_B.inputs[i] = basis_impl->d_interp_1d;
303         code << "  local CeedScalar s_B_in_" << i << "[" << P_1d * Q_1d << "];\n";
304         code << "  loadMatrix(P_in_" << i << "*Q_1D, B->inputs[" << i << "], s_B_in_" << i << ");\n";
305         if (use_collograd_parallelization) {
306           h_G.inputs[i] = basis_impl->d_collo_grad_1d;
307           code << "  local CeedScalar s_G_in_" << i << "[" << Q_1d * Q_1d << "];\n";
308           code << "  loadMatrix(Q_1D*Q_1D, G->inputs[" << i << "], s_G_in_" << i << ");\n";
309         } else {
310           bool has_collo_grad = basis_impl->d_collo_grad_1d;
311           h_G.inputs[i]       = has_collo_grad ? basis_impl->d_collo_grad_1d : basis_impl->d_grad_1d;
312           code << "  local CeedScalar s_G_in_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n";
313           code << "  loadMatrix(" << (has_collo_grad ? "Q_1D" : ("P_in_" + std::to_string(i))) << "*Q_1D, G->inputs[" << i << "], s_G_in_" << i
314                << ");\n";
315         }
316         break;
317       case CEED_EVAL_WEIGHT:
318         break;  // No action
319       case CEED_EVAL_DIV:
320         break;  // TODO: Not implemented
321       case CEED_EVAL_CURL:
322         break;  // TODO: Not implemented
323     }
324   }
325 
326   code << "\n  // -- Output field constants and basis data --\n";
327   for (CeedInt i = 0; i < num_output_fields; i++) {
328     code << "  // ---- Output field " << i << " ----\n";
329     // Get elem_size, eval_mode, num_comp
330     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
331     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
332     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
333     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
334 
335     // Set field constants
336     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
337     if (basis != CEED_BASIS_NONE) {
338       CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
339       code << "  const CeedInt P_out_" << i << " = " << P_1d << ";\n";
340     } else {
341       code << "  const CeedInt P_out_" << i << " = " << Q_1d << ";\n";
342     }
343     code << "  const CeedInt num_comp_out_" << i << " = " << num_comp << ";\n";
344 
345     // Load basis data
346     code << "  // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
347     switch (eval_mode) {
348       case CEED_EVAL_NONE:
349         break;  // No action
350       case CEED_EVAL_INTERP:
351         CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
352         h_B.outputs[i] = basis_impl->d_interp_1d;
353         code << "  local CeedScalar s_B_out_" << i << "[" << P_1d * Q_1d << "];\n";
354         code << "  loadMatrix(P_out_" << i << "*Q_1D, B->outputs[" << i << "], s_B_out_" << i << ");\n";
355         break;
356       case CEED_EVAL_GRAD:
357         CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
358         h_B.outputs[i] = basis_impl->d_interp_1d;
359         code << "  local CeedScalar s_B_out_" << i << "[" << P_1d * Q_1d << "];\n";
360         code << "  loadMatrix(P_out_" << i << "*Q_1D, B->outputs[" << i << "], s_B_out_" << i << ");\n";
361         if (use_collograd_parallelization) {
362           h_G.outputs[i] = basis_impl->d_collo_grad_1d;
363           code << "  local CeedScalar s_G_out_" << i << "[" << Q_1d * Q_1d << "];\n";
364           code << "  loadMatrix(Q_1D*Q_1D, G->outputs[" << i << "], s_G_out_" << i << ");\n";
365         } else {
366           bool has_collo_grad = basis_impl->d_collo_grad_1d;
367           h_G.outputs[i]      = has_collo_grad ? basis_impl->d_collo_grad_1d : basis_impl->d_grad_1d;
368           code << "  local CeedScalar s_G_out_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n";
369           code << "  loadMatrix(" << (has_collo_grad ? "Q_1D" : ("P_out_" + std::to_string(i))) << "*Q_1D, G->outputs[" << i << "], s_G_out_" << i
370                << ");\n";
371         }
372         break;
373       // LCOV_EXCL_START
374       case CEED_EVAL_WEIGHT: {
375         Ceed ceed;
376 
377         CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
378         return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
379         break;  // Should not occur
380       }
381       case CEED_EVAL_DIV:
382       case CEED_EVAL_CURL: {
383         Ceed ceed;
384 
385         CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
386         return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
387         break;  // Should not occur
388       }
389         // LCOV_EXCL_STOP
390     }
391   }
392   code << "\n  // -- Element loop --\n";
393   code << "  work_group_barrier(CLK_LOCAL_MEM_FENCE);\n";
394   code << "  {\n";
395   // Input basis apply if needed
396   // Generate the correct eval mode code for each input
397   code << "    // -- Input field restrictions and basis actions --\n";
398   for (CeedInt i = 0; i < num_input_fields; i++) {
399     code << "    // ---- Input field " << i << " ----\n";
400     // Get elem_size, eval_mode, num_comp
401     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
402     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
403     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
404     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
405 
406     // Restriction
407     if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_collograd_parallelization)) {
408       bool is_strided;
409 
410       code << "    CeedScalar r_u_" << i << "[num_comp_in_" << i << "*P_in_" << i << "];\n";
411 
412       CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
413       if (!is_strided) {
414         CeedInt comp_stride;
415 
416         CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
417         code << "    const CeedInt l_size_in_" << i << " = " << l_size << ";\n";
418         CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
419         code << "    // CompStride: " << comp_stride << "\n";
420         CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_impl));
421         h_indices.inputs[i] = rstr_impl->d_ind;
422         code << "    readDofsOffset" << dim << "d(num_comp_in_" << i << ", " << comp_stride << ", P_in_" << i << ", num_elem, indices->inputs[" << i
423              << "], d_u_" << i << ", r_u_" << i << ");\n";
424       } else {
425         bool    has_backend_strides;
426         CeedInt num_elem;
427 
428         CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
429         CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
430         CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
431 
432         if (!has_backend_strides) {
433           CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
434         }
435         code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
436         code << "    readDofsStrided" << dim << "d(num_comp_in_" << i << ",P_in_" << i << "," << strides[0] << "," << strides[1] << "," << strides[2]
437              << ", num_elem, d_u_" << i << ", r_u_" << i << ");\n";
438       }
439     }
440 
441     // Basis action
442     code << "    // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
443     switch (eval_mode) {
444       case CEED_EVAL_NONE:
445         if (!use_collograd_parallelization) {
446           code << "    private CeedScalar* r_t_" << i << " = r_u_" << i << ";\n";
447         }
448         break;
449       case CEED_EVAL_INTERP:
450         code << "    CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1D];\n";
451         code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_in_" << i << ", P_in_" << i << ", Q_1D, r_u_" << i << ", s_B_in_" << i
452              << ", r_t_" << i << ", elem_scratch);\n";
453         break;
454       case CEED_EVAL_GRAD:
455         if (use_collograd_parallelization) {
456           code << "    CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1D];\n";
457           code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_in_" << i << ", P_in_" << i << ", Q_1D, r_u_" << i << ", s_B_in_"
458                << i << ", r_t_" << i << ", elem_scratch);\n";
459         } else {
460           CeedInt P_1d;
461           CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
462           CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
463           code << "    CeedScalar r_t_" << i << "[num_comp_in_" << i << "*DIM*Q_1D];\n";
464           code << "    Grad" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d(num_comp_in_" << i
465                << ", P_in_" << i << ", Q_1D, r_u_" << i << (dim > 1 ? ", s_B_in_" : "") << (dim > 1 ? std::to_string(i) : "") << ", s_G_in_" << i
466                << ", r_t_" << i << ", elem_scratch);\n";
467         }
468         break;
469       case CEED_EVAL_WEIGHT:
470         code << "    CeedScalar r_t_" << i << "[Q_1D];\n";
471         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
472         CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
473         impl->W = basis_impl->d_q_weight_1d;
474         code << "    Weight" << (dim > 1 ? "Tensor" : "") << dim << "d(Q_1D, W, r_t_" << i << ");\n";
475         break;  // No action
476       case CEED_EVAL_DIV:
477         break;  // TODO: Not implemented
478       case CEED_EVAL_CURL:
479         break;  // TODO: Not implemented
480     }
481   }
482 
483   // Q function
484   code << "\n    // -- Output field setup --\n";
485   for (CeedInt i = 0; i < num_output_fields; i++) {
486     code << "\n    // ---- Output field " << i << " ----\n";
487     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
488     if (eval_mode == CEED_EVAL_GRAD) {
489       if (use_collograd_parallelization) {
490         // Accumulator for gradient slices
491         code << "    CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*Q_1D];\n";
492         code << "    for (CeedInt i = 0; i < num_comp_out_" << i << "; i++) {\n";
493         code << "      for (CeedInt j = 0; j < Q_1D; ++j) {\n";
494         code << "        r_tt_" << i << "[j + i*Q_1D] = 0.0;\n";
495         code << "      }\n";
496         code << "    }\n";
497       } else {
498         code << "    CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*DIM*Q_1D];\n";
499       }
500     }
501     if (eval_mode == CEED_EVAL_NONE || eval_mode == CEED_EVAL_INTERP) {
502       code << "    CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*Q_1D];\n";
503     }
504   }
505   // We treat quadrature points per slice in 3d to save registers
506   if (use_collograd_parallelization) {
507     code << "\n    // Note: Using planes of 3D elements\n";
508     code << "    for (CeedInt q = 0; q < Q_1D; q++) {\n";
509     code << "      // -- Input fields --\n";
510     for (CeedInt i = 0; i < num_input_fields; i++) {
511       code << "      // ---- Input field " << i << " ----\n";
512       // Get elem_size, eval_mode, num_comp
513       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
514       // Basis action
515       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
516       switch (eval_mode) {
517         case CEED_EVAL_NONE:
518           bool is_strided;
519 
520           code << "      CeedScalar r_q_" << i << "[num_comp_in_" << i << "];\n";
521 
522           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
523           CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
524           if (!is_strided) {
525             CeedInt comp_stride;
526 
527             CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
528             code << "      const CeedInt l_size_in_" << i << " = " << l_size << ";\n";
529             CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
530             code << "      // CompStride: " << comp_stride << "\n";
531             CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_impl));
532             h_indices.inputs[i] = rstr_impl->d_ind;
533             code << "      readSliceQuadsOffset"
534                  << "3d(num_comp_in_" << i << ", " << comp_stride << ", Q_1D, l_size_in_" << i << ", num_elem, q, indices->inputs[" << i << "], d_u_"
535                  << i << ", r_q_" << i << ");\n";
536           } else {
537             bool    has_backend_strides;
538             CeedInt num_elem;
539 
540             CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
541             CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
542             CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
543             CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
544 
545             if (!has_backend_strides) {
546               CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
547             }
548             code << "      // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
549             code << "      readSliceQuadsStrided"
550                  << "3d(num_comp_in_" << i << ", Q_1D," << strides[0] << ", " << strides[1] << ", " << strides[2] << ", num_elem, q, d_u_" << i
551                  << ", r_q_" << i << ");\n";
552           }
553           break;
554         case CEED_EVAL_INTERP:
555           code << "      CeedScalar r_q_" << i << "[num_comp_in_" << i << "];\n";
556           code << "      for (CeedInt j = 0; j < num_comp_in_" << i << " ; ++j) {\n";
557           code << "        r_q_" << i << "[j] = r_t_" << i << "[q + j*Q_1D];\n";
558           code << "      }\n";
559           break;
560         case CEED_EVAL_GRAD:
561           code << "      CeedScalar r_q_" << i << "[num_comp_in_" << i << "*DIM];\n";
562           code << "      gradCollo3d(num_comp_in_" << i << ", Q_1D, q, r_t_" << i << ", s_G_in_" << i << ", r_q_" << i << ", elem_scratch);\n";
563           break;
564         case CEED_EVAL_WEIGHT:
565           code << "      CeedScalar r_q_" << i << "[1];\n";
566           code << "      r_q_" << i << "[0] = r_t_" << i << "[q];\n";
567           break;  // No action
568         case CEED_EVAL_DIV:
569           break;  // TODO: Not implemented
570         case CEED_EVAL_CURL:
571           break;  // TODO: Not implemented
572       }
573     }
574     code << "\n      // -- Output fields --\n";
575     for (CeedInt i = 0; i < num_output_fields; i++) {
576       code << "      // ---- Output field " << i << " ----\n";
577       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
578       // Basis action
579       switch (eval_mode) {
580         case CEED_EVAL_NONE:
581           code << "      CeedScalar r_qq_" << i << "[num_comp_out_" << i << "];\n";
582           break;  // No action
583         case CEED_EVAL_INTERP:
584           code << "      CeedScalar r_qq_" << i << "[num_comp_out_" << i << "];\n";
585           break;
586         case CEED_EVAL_GRAD:
587           code << "      CeedScalar r_qq_" << i << "[num_comp_out_" << i << "*DIM];\n";
588           break;
589         case CEED_EVAL_WEIGHT:
590           break;  // Should not occur
591         case CEED_EVAL_DIV:
592           break;  // TODO: Not implemented
593         case CEED_EVAL_CURL:
594           break;  // TODO: Not implemented
595       }
596     }
597   } else {
598     code << "\n      // Note: Using full elements\n";
599     code << "      // -- Input fields --\n";
600     for (CeedInt i = 0; i < num_input_fields; i++) {
601       code << "      // ---- Input field " << i << " ----\n";
602       code << "      private CeedScalar* r_q_" << i << " = r_t_" << i << ";\n";
603     }
604     code << "      // -- Output fields --\n";
605     for (CeedInt i = 0; i < num_output_fields; i++) {
606       code << "      // ---- Output field " << i << " ----\n";
607       code << "      private CeedScalar* r_qq_" << i << " = r_tt_" << i << ";\n";
608     }
609   }
610   //--------------------------------------------------
611   code << "\n      // -- QFunction Inputs and outputs --\n";
612   code << "      const CeedScalar * in[" << num_input_fields << "];\n";
613   for (CeedInt i = 0; i < num_input_fields; i++) {
614     code << "      // ---- Input field " << i << " ----\n";
615     code << "      in[" << i << "] = r_q_" << i << ";\n";
616   }
617   code << "      CeedScalar * out[" << num_output_fields << "];\n";
618   for (CeedInt i = 0; i < num_output_fields; i++) {
619     code << "      // ---- Output field " << i << " ----\n";
620     code << "      out[" << i << "] = r_qq_" << i << ";\n";
621   }
622 
623   code << "\n      // -- Apply QFunction --\n";
624   code << "      " << qfunction_name << "(ctx, ";
625   if (dim != 3 || use_collograd_parallelization) {
626     code << "1";
627   } else {
628     code << "Q_1D";
629   }
630   code << ", in, out);\n";
631   //--------------------------------------------------
632 
633   if (use_collograd_parallelization) {
634     code << "      // -- Output fields --\n";
635     for (CeedInt i = 0; i < num_output_fields; i++) {
636       code << "      // ---- Output field " << i << " ----\n";
637       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
638       // Basis action
639       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
640       switch (eval_mode) {
641         case CEED_EVAL_NONE:
642           code << "      for (CeedInt j = 0; j < num_comp_out_" << i << " ; ++j) {\n";
643           code << "        r_tt_" << i << "[q + j*Q_1D] = r_qq_" << i << "[j];\n";
644           code << "      }\n";
645           break;  // No action
646         case CEED_EVAL_INTERP:
647           code << "      for (CeedInt j = 0; j < num_comp_out_" << i << " ; ++j) {\n";
648           code << "        r_tt_" << i << "[q + j*Q_1D] = r_qq_" << i << "[j];\n";
649           code << "      }\n";
650           break;
651         case CEED_EVAL_GRAD:
652           code << "      gradColloTranspose3d(num_comp_out_" << i << ",Q_1D, q, r_qq_" << i << ", s_G_out_" << i << ", r_tt_" << i
653                << ", elem_scratch);\n";
654           break;
655         case CEED_EVAL_WEIGHT:
656           break;  // Should not occur
657         case CEED_EVAL_DIV:
658           break;  // TODO: Not implemented
659         case CEED_EVAL_CURL:
660           break;  // TODO: Not implemented
661       }
662     }
663     code << "    }\n";
664   }
665 
666   // Output basis apply if needed
667   // Generate the correct eval mode code for each output
668   code << "\n    // -- Output field basis action and restrictions --\n";
669   for (CeedInt i = 0; i < num_output_fields; i++) {
670     code << "    // ---- Output field " << i << " ----\n";
671     // Get elem_size, eval_mode, num_comp
672     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
673     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
674     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
675     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
676     // Basis action
677     code << "    // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
678     switch (eval_mode) {
679       case CEED_EVAL_NONE:
680         code << "    private CeedScalar* r_v_" << i << " = r_tt_" << i << ";\n";
681         break;  // No action
682       case CEED_EVAL_INTERP:
683         code << "    CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n";
684         code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_out_" << i << ",P_out_" << i << ", Q_1D, r_tt_" << i
685              << ", s_B_out_" << i << ", r_v_" << i << ", elem_scratch);\n";
686         break;
687       case CEED_EVAL_GRAD:
688         code << "    CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n";
689         if (use_collograd_parallelization) {
690           code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_out_" << i << ",P_out_" << i << ", Q_1D, r_tt_" << i
691                << ", s_B_out_" << i << ", r_v_" << i << ", elem_scratch);\n";
692         } else {
693           CeedInt P_1d;
694           CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
695           CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
696           code << "    GradTranspose" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d(num_comp_out_" << i
697                << ", P_out_" << i << ", Q_1D, r_tt_" << i << (dim > 1 ? ", s_B_out_" : "") << (dim > 1 ? std::to_string(i) : "") << ", s_G_out_" << i
698                << ", r_v_" << i << ", elem_scratch);\n";
699         }
700         break;
701       // LCOV_EXCL_START
702       case CEED_EVAL_WEIGHT: {
703         Ceed ceed;
704 
705         CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
706         return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
707         break;  // Should not occur
708       }
709       case CEED_EVAL_DIV:
710       case CEED_EVAL_CURL: {
711         Ceed ceed;
712 
713         CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
714         return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
715         break;  // Should not occur
716       }
717         // LCOV_EXCL_STOP
718     }
719     // Restriction
720     bool is_strided;
721 
722     CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
723     if (!is_strided) {
724       CeedInt comp_stride;
725 
726       CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
727       code << "    const CeedInt l_size_out_" << i << " = " << l_size << ";\n";
728       CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
729       code << "    // CompStride: " << comp_stride << "\n";
730       CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_impl));
731       h_indices.outputs[i] = rstr_impl->d_ind;
732       code << "    writeDofsOffset" << dim << "d(num_comp_out_" << i << ", " << comp_stride << ", P_out_" << i << ", num_elem, indices->outputs[" << i
733            << "], r_v_" << i << ", d_v_" << i << ");\n";
734     } else {
735       bool    has_backend_strides;
736       CeedInt num_elem;
737 
738       CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
739       CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
740       CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
741 
742       if (!has_backend_strides) {
743         CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
744       }
745       code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
746       code << "    writeDofsStrided" << dim << "d(num_comp_out_" << i << ",P_out_" << i << "," << strides[0] << "," << strides[1] << "," << strides[2]
747            << ", num_elem, r_v_" << i << ", d_v_" << i << ");\n";
748     }
749   }
750 
751   code << "  }\n";
752   code << "}\n";
753   code << "// -----------------------------------------------------------------------------\n\n";
754 
755   // Copy the struct (containing device addresses) from the host to the device
756   sycl::event copy_B       = sycl_data->sycl_queue.copy<Fields_Sycl>(&h_B, impl->B, 1);
757   sycl::event copy_G       = sycl_data->sycl_queue.copy<Fields_Sycl>(&h_G, impl->G, 1);
758   sycl::event copy_indices = sycl_data->sycl_queue.copy<FieldsInt_Sycl>(&h_indices, impl->indices, 1);
759   // These copies can happen while the JIT is being done
760   CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_B, copy_G, copy_indices}));
761 
762   // View kernel for debugging
763   CeedDebug256(ceed, 2, "Generated Operator Kernels:\n");
764   CeedDebug(ceed, code.str().c_str());
765 
766   std::map<std::string, CeedInt> jit_constants;
767   jit_constants["T_1D"]         = block_sizes[0];
768   jit_constants["GROUP_SIZE_X"] = block_sizes[0];
769   jit_constants["GROUP_SIZE_Y"] = block_sizes[1];
770   jit_constants["GROUP_SIZE_Z"] = block_sizes[2];
771 
772   // Compile kernel into a kernel bundle
773   CeedCallBackend(CeedBuildModule_Sycl(ceed, code.str(), &impl->sycl_module, jit_constants));
774 
775   // Load kernel function
776   CeedCallBackend(CeedGetKernel_Sycl(ceed, impl->sycl_module, operator_name, &impl->op));
777 
778   CeedCallBackend(CeedOperatorSetSetupDone(op));
779   return CEED_ERROR_SUCCESS;
780 }
781 
782 //------------------------------------------------------------------------------
783