1 // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED: http://github.com/ceed
7
8 #define CEED_DEBUG_COLOR 12
9
10 #include <ceed/backend.h>
11 #include <ceed/ceed.h>
12 #include <ceed/jit-source/sycl/sycl-types.h>
13 #include <ceed/jit-tools.h>
14
15 #include <iostream>
16 #include <sstream>
17 #include <string>
18 #include <string_view>
19 #include <vector>
20
21 #include "../sycl-ref/ceed-sycl-ref.hpp"
22 #include "../sycl-shared/ceed-sycl-shared.hpp"
23 #include "../sycl/ceed-sycl-compile.hpp"
24
25 #include "ceed-sycl-gen.hpp"
26
27 //------------------------------------------------------------------------------
28 // Calculate the block size used for launching the operator kernel
29 //------------------------------------------------------------------------------
BlockGridCalculate_Sycl_gen(const CeedInt dim,const CeedInt P_1d,const CeedInt Q_1d,CeedInt * block_sizes)30 extern "C" int BlockGridCalculate_Sycl_gen(const CeedInt dim, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes) {
31 const CeedInt thread1d = CeedIntMax(Q_1d, P_1d);
32
33 if (dim == 1) {
34 CeedInt elems_per_block = 64 * thread1d > 256 ? 256 / thread1d : 64;
35
36 elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
37 block_sizes[0] = thread1d;
38 block_sizes[1] = 1;
39 block_sizes[2] = elems_per_block;
40 } else if (dim == 2) {
41 const CeedInt elems_per_block = thread1d < 4 ? 16 : 2;
42
43 block_sizes[0] = thread1d;
44 block_sizes[1] = thread1d;
45 block_sizes[2] = elems_per_block;
46 } else if (dim == 3) {
47 const CeedInt elems_per_block = thread1d < 6 ? 4 : (thread1d < 8 ? 2 : 1);
48
49 block_sizes[0] = thread1d;
50 block_sizes[1] = thread1d;
51 block_sizes[2] = elems_per_block;
52 }
53 return CEED_ERROR_SUCCESS;
54 }
55
56 //------------------------------------------------------------------------------
57 // Build single operator kernel
58 // - [ ] Check arguments to device functions reudsed from sycl-shared-basis are correct
59 // - [ ] Do kernel jitting!
60 //------------------------------------------------------------------------------
CeedOperatorBuildKernel_Sycl_gen(CeedOperator op)61 extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
62 Ceed ceed;
63 Ceed_Sycl *sycl_data;
64 bool is_setup_done, is_identity_qf;
65 CeedSize l_size;
66 CeedInt Q, P_1d = 0, Q_1d = 0, elem_size, num_input_fields, num_output_fields, num_comp, dim = 1;
67 Fields_Sycl h_B, h_G;
68 FieldsInt_Sycl h_indices;
69 CeedEvalMode eval_mode;
70 CeedElemRestriction elem_rstr;
71 CeedElemRestriction_Sycl *rstr_impl;
72 CeedBasis basis;
73 CeedBasis_Sycl_shared *basis_impl;
74 CeedQFunctionField *qf_input_fields, *qf_output_fields;
75 CeedQFunction_Sycl_gen *qf_impl;
76 CeedQFunction qf;
77 CeedOperatorField *op_input_fields, *op_output_fields;
78 CeedOperator_Sycl_gen *impl;
79
80 CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
81 if (is_setup_done) return CEED_ERROR_SUCCESS;
82
83 CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
84 CeedCallBackend(CeedGetData(ceed, &sycl_data));
85
86 CeedCallBackend(CeedOperatorGetData(op, &impl));
87 CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
88 CeedCallBackend(CeedQFunctionGetData(qf, &qf_impl));
89 CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
90 Q_1d = Q;
91
92 CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
93 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
94
95 // Check for restriction only identity operator
96 CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf));
97 if (is_identity_qf) {
98 CeedEvalMode eval_mode_in, eval_mode_out;
99
100 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in));
101 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out));
102 CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND,
103 "Backend does not implement restriction only identity operators");
104 }
105
106 std::ostringstream code;
107 // TODO: generalize to accept different device functions?
108 {
109 char *tensor_basis_code;
110 const char *tensor_basis_kernel_path;
111
112 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h", &tensor_basis_kernel_path));
113 CeedDebug256(ceed, 2, "----- Loading Tensor Basis Kernel Source -----\n");
114 CeedCallBackend(CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_code));
115 code << tensor_basis_code;
116 CeedCallBackend(CeedFree(&tensor_basis_kernel_path));
117 CeedCallBackend(CeedFree(&tensor_basis_code));
118 }
119 {
120 char *sycl_gen_template_source;
121 const char *sycl_gen_template_path;
122
123 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/sycl/sycl-gen-templates.h", &sycl_gen_template_path));
124 CeedDebug256(ceed, 2, "----- Loading Sycl-Gen Template Source -----\n");
125 CeedCallBackend(CeedLoadSourceToBuffer(ceed, sycl_gen_template_path, &sycl_gen_template_source));
126 code << sycl_gen_template_source;
127 CeedCallBackend(CeedFree(&sycl_gen_template_path));
128 CeedCallBackend(CeedFree(&sycl_gen_template_source));
129 }
130
131 std::string_view qfunction_source(qf_impl->qfunction_source);
132 std::string_view qfunction_name(qf_impl->qfunction_name);
133 const std::string operator_name = "CeedKernelSyclGenOperator_" + std::string(qfunction_name);
134
135 // Find dim, P_1d, Q_1d
136 impl->max_P_1d = 0;
137 for (CeedInt i = 0; i < num_input_fields; i++) {
138 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
139 if (basis != CEED_BASIS_NONE) {
140 bool is_tensor;
141
142 CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
143 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
144
145 // Collect dim, P_1d, and Q_1d
146 CeedCallBackend(CeedBasisGetDimension(basis, &dim));
147 CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
148 if (is_tensor) {
149 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
150 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
151 if (P_1d > impl->max_P_1d) impl->max_P_1d = P_1d;
152 } else {
153 // LCOV_EXCL_START
154 return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
155 // LCOV_EXCL_STOP
156 }
157 }
158 CeedCallBackend(CeedBasisDestroy(&basis));
159 }
160 // Check output bases for Q_1d, dim as well
161 // The only input basis might be CEED_BASIS_NONE
162 for (CeedInt i = 0; i < num_output_fields; i++) {
163 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
164 if (basis != CEED_BASIS_NONE) {
165 bool is_tensor;
166
167 CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
168 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
169
170 // Collect Q_1d
171 CeedCallBackend(CeedBasisGetDimension(basis, &dim));
172 CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
173 if (is_tensor) {
174 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
175 } else {
176 // LCOV_EXCL_START
177 return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
178 // LCOV_EXCL_STOP
179 }
180 }
181 CeedCallBackend(CeedBasisDestroy(&basis));
182 }
183 impl->dim = dim;
184 impl->Q_1d = Q_1d;
185
186 // Only use 3D collocated gradient parallelization strategy when gradient is computed
187 // TODO: put in a function?
188 bool use_collograd_parallelization = false;
189
190 if (dim == 3) {
191 bool was_grad_found = false;
192
193 for (CeedInt i = 0; i < num_input_fields; i++) {
194 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
195 if (eval_mode == CEED_EVAL_GRAD) {
196 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
197 CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
198 use_collograd_parallelization = basis_impl->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true);
199 was_grad_found = true;
200 CeedCallBackend(CeedBasisDestroy(&basis));
201 }
202 }
203 for (CeedInt i = 0; i < num_output_fields; i++) {
204 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
205 if (eval_mode == CEED_EVAL_GRAD) {
206 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
207 CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
208 use_collograd_parallelization = basis_impl->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true);
209 was_grad_found = true;
210 CeedCallBackend(CeedBasisDestroy(&basis));
211 }
212 }
213 }
214
215 CeedInt block_sizes[3];
216 CeedCallBackend(BlockGridCalculate_Sycl_gen(dim, P_1d, Q_1d, block_sizes));
217
218 // Define CEED_Q_VLA
219 code << "\n#undef CEED_Q_VLA\n";
220 if (dim != 3 || use_collograd_parallelization) {
221 code << "#define CEED_Q_VLA 1\n\n";
222 } else {
223 code << "#define CEED_Q_VLA " << Q_1d << "\n\n";
224 }
225
226 // Determine subgroup size based on supported sizes : Default : 16 (if supported)
227 std::vector allowed_sg_sizes = sycl_data->sycl_device.get_info<sycl::info::device::sub_group_sizes>();
228 CeedInt sub_group_size_op = allowed_sg_sizes[allowed_sg_sizes.size() - 1];
229 for (const auto &s : allowed_sg_sizes) {
230 if (s == 16) {
231 sub_group_size_op = s;
232 break;
233 }
234 }
235
236 code << qfunction_source;
237
238 // Kernel function
239 code << "\n// -----------------------------------------------------------------------------\n";
240 code << "__attribute__((reqd_work_group_size(GROUP_SIZE_X, GROUP_SIZE_Y, GROUP_SIZE_Z), intel_reqd_sub_group_size(" << sub_group_size_op << ")))\n";
241 code << "kernel void " << operator_name << "(";
242 code << "const CeedInt num_elem, ";
243 code << "global void* ctx, ";
244 code << "global const FieldsInt_Sycl* indices, ";
245 code << "global Fields_Sycl* fields, ";
246 code << "global const Fields_Sycl* B, ";
247 code << "global const Fields_Sycl* G, ";
248 code << "global const CeedScalar * restrict W";
249 code << ") {\n";
250
251 for (CeedInt i = 0; i < num_input_fields; i++) {
252 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
253 if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT
254 code << " global const CeedScalar* d_u_" << i << " = fields->inputs[" << i << "];\n";
255 }
256 }
257
258 for (CeedInt i = 0; i < num_output_fields; i++) {
259 code << " global CeedScalar* d_v_" << i << " = fields->outputs[" << i << "];\n";
260 }
261
262 // TODO: Convert these to defined constants to save on GRF
263 code << " const CeedInt DIM = " << dim << ";\n";
264 code << " const CeedInt Q_1D = " << Q_1d << ";\n";
265
266 const CeedInt scratch_size = block_sizes[0] * block_sizes[1] * block_sizes[2];
267 code << " local CeedScalar scratch[" << scratch_size << "];\n";
268 code << " local CeedScalar * elem_scratch = scratch + get_local_id(2) * T_1D" << (dim > 1 ? "*T_1D" : "") << ";\n";
269
270 code << "\n // -- Input field constants and basis data --\n";
271 // Initialize constants, and matrices B and G
272 for (CeedInt i = 0; i < num_input_fields; i++) {
273 code << " // ---- Input field " << i << " ----\n";
274 // Get elem_size, eval_mode, num_comp
275 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
276 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
277 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
278 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
279 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
280
281 // Set field constants
282 if (eval_mode != CEED_EVAL_WEIGHT) {
283 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
284 if (basis != CEED_BASIS_NONE) {
285 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
286 code << " const CeedInt P_in_" << i << " = " << P_1d << ";\n";
287 } else {
288 code << " const CeedInt P_in_" << i << " = " << Q_1d << ";\n";
289 }
290 code << " const CeedInt num_comp_in_" << i << " = " << num_comp << ";\n";
291 }
292
293 // Load basis data
294 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
295 switch (eval_mode) {
296 case CEED_EVAL_NONE:
297 break;
298 case CEED_EVAL_INTERP:
299 CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
300 h_B.inputs[i] = basis_impl->d_interp_1d;
301 code << " local CeedScalar s_B_in_" << i << "[" << P_1d * Q_1d << "];\n";
302 code << " loadMatrix(P_in_" << i << "*Q_1D, B->inputs[" << i << "], s_B_in_" << i << ");\n";
303 break;
304 case CEED_EVAL_GRAD:
305 CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
306 h_B.inputs[i] = basis_impl->d_interp_1d;
307 code << " local CeedScalar s_B_in_" << i << "[" << P_1d * Q_1d << "];\n";
308 code << " loadMatrix(P_in_" << i << "*Q_1D, B->inputs[" << i << "], s_B_in_" << i << ");\n";
309 if (use_collograd_parallelization) {
310 h_G.inputs[i] = basis_impl->d_collo_grad_1d;
311 code << " local CeedScalar s_G_in_" << i << "[" << Q_1d * Q_1d << "];\n";
312 code << " loadMatrix(Q_1D*Q_1D, G->inputs[" << i << "], s_G_in_" << i << ");\n";
313 } else {
314 bool has_collo_grad = basis_impl->d_collo_grad_1d;
315 h_G.inputs[i] = has_collo_grad ? basis_impl->d_collo_grad_1d : basis_impl->d_grad_1d;
316 code << " local CeedScalar s_G_in_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n";
317 code << " loadMatrix(" << (has_collo_grad ? "Q_1D" : ("P_in_" + std::to_string(i))) << "*Q_1D, G->inputs[" << i << "], s_G_in_" << i
318 << ");\n";
319 }
320 break;
321 case CEED_EVAL_WEIGHT:
322 break; // No action
323 case CEED_EVAL_DIV:
324 break; // TODO: Not implemented
325 case CEED_EVAL_CURL:
326 break; // TODO: Not implemented
327 }
328 CeedCallBackend(CeedBasisDestroy(&basis));
329 }
330
331 code << "\n // -- Output field constants and basis data --\n";
332 for (CeedInt i = 0; i < num_output_fields; i++) {
333 code << " // ---- Output field " << i << " ----\n";
334 // Get elem_size, eval_mode, num_comp
335 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
336 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
337 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
338 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
339 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
340
341 // Set field constants
342 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
343 if (basis != CEED_BASIS_NONE) {
344 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
345 code << " const CeedInt P_out_" << i << " = " << P_1d << ";\n";
346 } else {
347 code << " const CeedInt P_out_" << i << " = " << Q_1d << ";\n";
348 }
349 code << " const CeedInt num_comp_out_" << i << " = " << num_comp << ";\n";
350
351 // Load basis data
352 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
353 switch (eval_mode) {
354 case CEED_EVAL_NONE:
355 break; // No action
356 case CEED_EVAL_INTERP:
357 CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
358 h_B.outputs[i] = basis_impl->d_interp_1d;
359 code << " local CeedScalar s_B_out_" << i << "[" << P_1d * Q_1d << "];\n";
360 code << " loadMatrix(P_out_" << i << "*Q_1D, B->outputs[" << i << "], s_B_out_" << i << ");\n";
361 break;
362 case CEED_EVAL_GRAD:
363 CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
364 h_B.outputs[i] = basis_impl->d_interp_1d;
365 code << " local CeedScalar s_B_out_" << i << "[" << P_1d * Q_1d << "];\n";
366 code << " loadMatrix(P_out_" << i << "*Q_1D, B->outputs[" << i << "], s_B_out_" << i << ");\n";
367 if (use_collograd_parallelization) {
368 h_G.outputs[i] = basis_impl->d_collo_grad_1d;
369 code << " local CeedScalar s_G_out_" << i << "[" << Q_1d * Q_1d << "];\n";
370 code << " loadMatrix(Q_1D*Q_1D, G->outputs[" << i << "], s_G_out_" << i << ");\n";
371 } else {
372 bool has_collo_grad = basis_impl->d_collo_grad_1d;
373 h_G.outputs[i] = has_collo_grad ? basis_impl->d_collo_grad_1d : basis_impl->d_grad_1d;
374 code << " local CeedScalar s_G_out_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n";
375 code << " loadMatrix(" << (has_collo_grad ? "Q_1D" : ("P_out_" + std::to_string(i))) << "*Q_1D, G->outputs[" << i << "], s_G_out_" << i
376 << ");\n";
377 }
378 break;
379 // LCOV_EXCL_START
380 case CEED_EVAL_WEIGHT: {
381 return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
382 break; // Should not occur
383 }
384 case CEED_EVAL_DIV:
385 case CEED_EVAL_CURL: {
386 return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
387 break; // Should not occur
388 }
389 // LCOV_EXCL_STOP
390 }
391 CeedCallBackend(CeedBasisDestroy(&basis));
392 }
393 code << "\n // -- Element loop --\n";
394 code << " work_group_barrier(CLK_LOCAL_MEM_FENCE);\n";
395 code << " {\n";
396 // Input basis apply if needed
397 // Generate the correct eval mode code for each input
398 code << " // -- Input field restrictions and basis actions --\n";
399 for (CeedInt i = 0; i < num_input_fields; i++) {
400 code << " // ---- Input field " << i << " ----\n";
401 // Get elem_size, eval_mode, num_comp
402 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
403 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
404 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
405 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
406
407 // Restriction
408 if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_collograd_parallelization)) {
409 bool is_strided;
410
411 code << " CeedScalar r_u_" << i << "[num_comp_in_" << i << "*P_in_" << i << "];\n";
412
413 CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
414 if (!is_strided) {
415 CeedInt comp_stride;
416
417 CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
418 code << " const CeedInt l_size_in_" << i << " = " << l_size << ";\n";
419 CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
420 code << " // CompStride: " << comp_stride << "\n";
421 CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_impl));
422 h_indices.inputs[i] = rstr_impl->d_offsets;
423 code << " readDofsOffset" << dim << "d(num_comp_in_" << i << ", " << comp_stride << ", P_in_" << i << ", num_elem, indices->inputs[" << i
424 << "], d_u_" << i << ", r_u_" << i << ");\n";
425 } else {
426 bool has_backend_strides;
427 CeedInt num_elem;
428
429 CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
430 CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
431 CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
432
433 if (!has_backend_strides) {
434 CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
435 }
436 code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
437 code << " readDofsStrided" << dim << "d(num_comp_in_" << i << ",P_in_" << i << "," << strides[0] << "," << strides[1] << "," << strides[2]
438 << ", num_elem, d_u_" << i << ", r_u_" << i << ");\n";
439 }
440 }
441 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
442
443 // Basis action
444 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
445 switch (eval_mode) {
446 case CEED_EVAL_NONE:
447 if (!use_collograd_parallelization) {
448 code << " private CeedScalar* r_t_" << i << " = r_u_" << i << ";\n";
449 }
450 break;
451 case CEED_EVAL_INTERP:
452 code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1D];\n";
453 code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_in_" << i << ", P_in_" << i << ", Q_1D, r_u_" << i << ", s_B_in_" << i
454 << ", r_t_" << i << ", elem_scratch);\n";
455 break;
456 case CEED_EVAL_GRAD:
457 if (use_collograd_parallelization) {
458 code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1D];\n";
459 code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_in_" << i << ", P_in_" << i << ", Q_1D, r_u_" << i << ", s_B_in_"
460 << i << ", r_t_" << i << ", elem_scratch);\n";
461 } else {
462 CeedInt P_1d;
463
464 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
465 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
466 code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "*DIM*Q_1D];\n";
467 code << " Grad" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d(num_comp_in_" << i
468 << ", P_in_" << i << ", Q_1D, r_u_" << i << (dim > 1 ? ", s_B_in_" : "") << (dim > 1 ? std::to_string(i) : "") << ", s_G_in_" << i
469 << ", r_t_" << i << ", elem_scratch);\n";
470 CeedCallBackend(CeedBasisDestroy(&basis));
471 }
472 break;
473 case CEED_EVAL_WEIGHT:
474 code << " CeedScalar r_t_" << i << "[Q_1D];\n";
475 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
476 CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
477 impl->W = basis_impl->d_q_weight_1d;
478 code << " Weight" << (dim > 1 ? "Tensor" : "") << dim << "d(Q_1D, W, r_t_" << i << ");\n";
479 CeedCallBackend(CeedBasisDestroy(&basis));
480 break; // No action
481 case CEED_EVAL_DIV:
482 break; // TODO: Not implemented
483 case CEED_EVAL_CURL:
484 break; // TODO: Not implemented
485 }
486 }
487
488 // Q function
489 code << "\n // -- Output field setup --\n";
490 for (CeedInt i = 0; i < num_output_fields; i++) {
491 code << "\n // ---- Output field " << i << " ----\n";
492 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
493 if (eval_mode == CEED_EVAL_GRAD) {
494 if (use_collograd_parallelization) {
495 // Accumulator for gradient slices
496 code << " CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*Q_1D];\n";
497 code << " for (CeedInt i = 0; i < num_comp_out_" << i << "; i++) {\n";
498 code << " for (CeedInt j = 0; j < Q_1D; ++j) {\n";
499 code << " r_tt_" << i << "[j + i*Q_1D] = 0.0;\n";
500 code << " }\n";
501 code << " }\n";
502 } else {
503 code << " CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*DIM*Q_1D];\n";
504 }
505 }
506 if (eval_mode == CEED_EVAL_NONE || eval_mode == CEED_EVAL_INTERP) {
507 code << " CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*Q_1D];\n";
508 }
509 }
510 // We treat quadrature points per slice in 3d to save registers
511 if (use_collograd_parallelization) {
512 code << "\n // Note: Using planes of 3D elements\n";
513 code << " for (CeedInt q = 0; q < Q_1D; q++) {\n";
514 code << " // -- Input fields --\n";
515 for (CeedInt i = 0; i < num_input_fields; i++) {
516 code << " // ---- Input field " << i << " ----\n";
517 // Get elem_size, eval_mode, num_comp
518 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
519 // Basis action
520 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
521 switch (eval_mode) {
522 case CEED_EVAL_NONE:
523 bool is_strided;
524
525 code << " CeedScalar r_q_" << i << "[num_comp_in_" << i << "];\n";
526
527 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
528 CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
529 if (!is_strided) {
530 CeedInt comp_stride;
531
532 CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
533 code << " const CeedInt l_size_in_" << i << " = " << l_size << ";\n";
534 CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
535 code << " // CompStride: " << comp_stride << "\n";
536 CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_impl));
537 h_indices.inputs[i] = rstr_impl->d_offsets;
538 code << " readSliceQuadsOffset"
539 << "3d(num_comp_in_" << i << ", " << comp_stride << ", Q_1D, l_size_in_" << i << ", num_elem, q, indices->inputs[" << i << "], d_u_"
540 << i << ", r_q_" << i << ");\n";
541 } else {
542 bool has_backend_strides;
543 CeedInt num_elem;
544
545 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
546 CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
547 CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
548 CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
549
550 if (!has_backend_strides) {
551 CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
552 }
553 code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
554 code << " readSliceQuadsStrided"
555 << "3d(num_comp_in_" << i << ", Q_1D," << strides[0] << ", " << strides[1] << ", " << strides[2] << ", num_elem, q, d_u_" << i
556 << ", r_q_" << i << ");\n";
557 }
558 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
559 break;
560 case CEED_EVAL_INTERP:
561 code << " CeedScalar r_q_" << i << "[num_comp_in_" << i << "];\n";
562 code << " for (CeedInt j = 0; j < num_comp_in_" << i << " ; ++j) {\n";
563 code << " r_q_" << i << "[j] = r_t_" << i << "[q + j*Q_1D];\n";
564 code << " }\n";
565 break;
566 case CEED_EVAL_GRAD:
567 code << " CeedScalar r_q_" << i << "[num_comp_in_" << i << "*DIM];\n";
568 code << " gradCollo3d(num_comp_in_" << i << ", Q_1D, q, r_t_" << i << ", s_G_in_" << i << ", r_q_" << i << ", elem_scratch);\n";
569 break;
570 case CEED_EVAL_WEIGHT:
571 code << " CeedScalar r_q_" << i << "[1];\n";
572 code << " r_q_" << i << "[0] = r_t_" << i << "[q];\n";
573 break; // No action
574 case CEED_EVAL_DIV:
575 break; // TODO: Not implemented
576 case CEED_EVAL_CURL:
577 break; // TODO: Not implemented
578 }
579 }
580 code << "\n // -- Output fields --\n";
581 for (CeedInt i = 0; i < num_output_fields; i++) {
582 code << " // ---- Output field " << i << " ----\n";
583 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
584 // Basis action
585 switch (eval_mode) {
586 case CEED_EVAL_NONE:
587 code << " CeedScalar r_qq_" << i << "[num_comp_out_" << i << "];\n";
588 break; // No action
589 case CEED_EVAL_INTERP:
590 code << " CeedScalar r_qq_" << i << "[num_comp_out_" << i << "];\n";
591 break;
592 case CEED_EVAL_GRAD:
593 code << " CeedScalar r_qq_" << i << "[num_comp_out_" << i << "*DIM];\n";
594 break;
595 case CEED_EVAL_WEIGHT:
596 break; // Should not occur
597 case CEED_EVAL_DIV:
598 break; // TODO: Not implemented
599 case CEED_EVAL_CURL:
600 break; // TODO: Not implemented
601 }
602 }
603 } else {
604 code << "\n // Note: Using full elements\n";
605 code << " // -- Input fields --\n";
606 for (CeedInt i = 0; i < num_input_fields; i++) {
607 code << " // ---- Input field " << i << " ----\n";
608 code << " private CeedScalar* r_q_" << i << " = r_t_" << i << ";\n";
609 }
610 code << " // -- Output fields --\n";
611 for (CeedInt i = 0; i < num_output_fields; i++) {
612 code << " // ---- Output field " << i << " ----\n";
613 code << " private CeedScalar* r_qq_" << i << " = r_tt_" << i << ";\n";
614 }
615 }
616 //--------------------------------------------------
617 code << "\n // -- QFunction Inputs and outputs --\n";
618 code << " const CeedScalar * in[" << num_input_fields << "];\n";
619 for (CeedInt i = 0; i < num_input_fields; i++) {
620 code << " // ---- Input field " << i << " ----\n";
621 code << " in[" << i << "] = r_q_" << i << ";\n";
622 }
623 code << " CeedScalar * out[" << num_output_fields << "];\n";
624 for (CeedInt i = 0; i < num_output_fields; i++) {
625 code << " // ---- Output field " << i << " ----\n";
626 code << " out[" << i << "] = r_qq_" << i << ";\n";
627 }
628
629 code << "\n // -- Apply QFunction --\n";
630 code << " " << qfunction_name << "(ctx, ";
631 if (dim != 3 || use_collograd_parallelization) {
632 code << "1";
633 } else {
634 code << "Q_1D";
635 }
636 code << ", in, out);\n";
637 //--------------------------------------------------
638
639 if (use_collograd_parallelization) {
640 code << " // -- Output fields --\n";
641 for (CeedInt i = 0; i < num_output_fields; i++) {
642 code << " // ---- Output field " << i << " ----\n";
643 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
644 // Basis action
645 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
646 switch (eval_mode) {
647 case CEED_EVAL_NONE:
648 code << " for (CeedInt j = 0; j < num_comp_out_" << i << " ; ++j) {\n";
649 code << " r_tt_" << i << "[q + j*Q_1D] = r_qq_" << i << "[j];\n";
650 code << " }\n";
651 break; // No action
652 case CEED_EVAL_INTERP:
653 code << " for (CeedInt j = 0; j < num_comp_out_" << i << " ; ++j) {\n";
654 code << " r_tt_" << i << "[q + j*Q_1D] = r_qq_" << i << "[j];\n";
655 code << " }\n";
656 break;
657 case CEED_EVAL_GRAD:
658 code << " gradColloTranspose3d(num_comp_out_" << i << ",Q_1D, q, r_qq_" << i << ", s_G_out_" << i << ", r_tt_" << i
659 << ", elem_scratch);\n";
660 break;
661 case CEED_EVAL_WEIGHT:
662 break; // Should not occur
663 case CEED_EVAL_DIV:
664 break; // TODO: Not implemented
665 case CEED_EVAL_CURL:
666 break; // TODO: Not implemented
667 }
668 }
669 code << " }\n";
670 }
671
672 // Output basis apply if needed
673 // Generate the correct eval mode code for each output
674 code << "\n // -- Output field basis action and restrictions --\n";
675 for (CeedInt i = 0; i < num_output_fields; i++) {
676 code << " // ---- Output field " << i << " ----\n";
677 // Get elem_size, eval_mode, num_comp
678 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
679 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
680 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
681 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
682 // Basis action
683 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
684 switch (eval_mode) {
685 case CEED_EVAL_NONE:
686 code << " private CeedScalar* r_v_" << i << " = r_tt_" << i << ";\n";
687 break; // No action
688 case CEED_EVAL_INTERP:
689 code << " CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n";
690 code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_out_" << i << ",P_out_" << i << ", Q_1D, r_tt_" << i
691 << ", s_B_out_" << i << ", r_v_" << i << ", elem_scratch);\n";
692 break;
693 case CEED_EVAL_GRAD:
694 code << " CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n";
695 if (use_collograd_parallelization) {
696 code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_out_" << i << ",P_out_" << i << ", Q_1D, r_tt_" << i
697 << ", s_B_out_" << i << ", r_v_" << i << ", elem_scratch);\n";
698 } else {
699 CeedInt P_1d;
700 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
701 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
702 code << " GradTranspose" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d(num_comp_out_" << i
703 << ", P_out_" << i << ", Q_1D, r_tt_" << i << (dim > 1 ? ", s_B_out_" : "") << (dim > 1 ? std::to_string(i) : "") << ", s_G_out_" << i
704 << ", r_v_" << i << ", elem_scratch);\n";
705 CeedCallBackend(CeedBasisDestroy(&basis));
706 }
707 break;
708 // LCOV_EXCL_START
709 case CEED_EVAL_WEIGHT: {
710 return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
711 break; // Should not occur
712 }
713 case CEED_EVAL_DIV:
714 case CEED_EVAL_CURL: {
715 return CeedError(CeedOperatorReturnCeed(op), CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]);
716 break; // Should not occur
717 }
718 // LCOV_EXCL_STOP
719 }
720 // Restriction
721 bool is_strided;
722
723 CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
724 if (!is_strided) {
725 CeedInt comp_stride;
726
727 CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
728 code << " const CeedInt l_size_out_" << i << " = " << l_size << ";\n";
729 CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
730 code << " // CompStride: " << comp_stride << "\n";
731 CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_impl));
732 h_indices.outputs[i] = rstr_impl->d_offsets;
733 code << " writeDofsOffset" << dim << "d(num_comp_out_" << i << ", " << comp_stride << ", P_out_" << i << ", num_elem, indices->outputs[" << i
734 << "], r_v_" << i << ", d_v_" << i << ");\n";
735 } else {
736 bool has_backend_strides;
737 CeedInt num_elem;
738
739 CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
740 CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
741 CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
742
743 if (!has_backend_strides) {
744 CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
745 }
746 code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
747 code << " writeDofsStrided" << dim << "d(num_comp_out_" << i << ",P_out_" << i << "," << strides[0] << "," << strides[1] << "," << strides[2]
748 << ", num_elem, r_v_" << i << ", d_v_" << i << ");\n";
749 }
750 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
751 }
752
753 code << " }\n";
754 code << "}\n";
755 code << "// -----------------------------------------------------------------------------\n\n";
756
757 // Copy the struct (containing device addresses) from the host to the device
758 std::vector<sycl::event> e;
759
760 if (!sycl_data->sycl_queue.is_in_order()) e = {sycl_data->sycl_queue.ext_oneapi_submit_barrier()};
761
762 sycl::event copy_B = sycl_data->sycl_queue.copy<Fields_Sycl>(&h_B, impl->B, 1, e);
763 sycl::event copy_G = sycl_data->sycl_queue.copy<Fields_Sycl>(&h_G, impl->G, 1, e);
764 sycl::event copy_indices = sycl_data->sycl_queue.copy<FieldsInt_Sycl>(&h_indices, impl->indices, 1, e);
765 // These copies can happen while the JIT is being done
766 CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_B, copy_G, copy_indices}));
767
768 // View kernel for debugging
769 CeedDebug256(ceed, 2, "Generated Operator Kernels:\n");
770 CeedDebug(ceed, code.str().c_str());
771
772 std::map<std::string, CeedInt> jit_constants;
773 jit_constants["T_1D"] = block_sizes[0];
774 jit_constants["GROUP_SIZE_X"] = block_sizes[0];
775 jit_constants["GROUP_SIZE_Y"] = block_sizes[1];
776 jit_constants["GROUP_SIZE_Z"] = block_sizes[2];
777
778 // Compile kernel into a kernel bundle
779 CeedCallBackend(CeedBuildModule_Sycl(ceed, code.str(), &impl->sycl_module, jit_constants));
780
781 // Load kernel function
782 CeedCallBackend(CeedGetKernel_Sycl(ceed, impl->sycl_module, operator_name, &impl->op));
783 CeedCallBackend(CeedOperatorSetSetupDone(op));
784 CeedCallBackend(CeedDestroy(&ceed));
785 CeedCallBackend(CeedQFunctionDestroy(&qf));
786 return CEED_ERROR_SUCCESS;
787 }
788
789 //------------------------------------------------------------------------------
790