1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #define CEED_DEBUG_COLOR 12 9 10 #include <ceed/backend.h> 11 #include <ceed/ceed.h> 12 #include <ceed/jit-source/sycl/sycl-types.h> 13 #include <ceed/jit-tools.h> 14 15 #include <iostream> 16 #include <sstream> 17 #include <string> 18 #include <string_view> 19 #include <vector> 20 21 #include "../sycl-ref/ceed-sycl-ref.hpp" 22 #include "../sycl-shared/ceed-sycl-shared.hpp" 23 #include "../sycl/ceed-sycl-compile.hpp" 24 25 #include "ceed-sycl-gen.hpp" 26 27 //------------------------------------------------------------------------------ 28 // Calculate the block size used for launching the operator kernel 29 //------------------------------------------------------------------------------ 30 extern "C" int BlockGridCalculate_Sycl_gen(const CeedInt dim, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes) { 31 const CeedInt thread1d = CeedIntMax(Q_1d, P_1d); 32 if (dim == 1) { 33 CeedInt elems_per_block = 64 * thread1d > 256 ? 256 / thread1d : 64; 34 elems_per_block = elems_per_block > 0 ? elems_per_block : 1; 35 block_sizes[0] = thread1d; 36 block_sizes[1] = 1; 37 block_sizes[2] = elems_per_block; 38 } else if (dim == 2) { 39 const CeedInt elems_per_block = thread1d < 4 ? 16 : 2; 40 block_sizes[0] = thread1d; 41 block_sizes[1] = thread1d; 42 block_sizes[2] = elems_per_block; 43 } else if (dim == 3) { 44 const CeedInt elems_per_block = thread1d < 6 ? 4 : (thread1d < 8 ? 2 : 1); 45 block_sizes[0] = thread1d; 46 block_sizes[1] = thread1d; 47 block_sizes[2] = elems_per_block; 48 } 49 return CEED_ERROR_SUCCESS; 50 } 51 52 //------------------------------------------------------------------------------ 53 // Build single operator kernel 54 // - [ ] Check arguments to device functions reudsed from sycl-shared-basis are correct 55 // - [ ] Do kernel jitting! 56 //------------------------------------------------------------------------------ 57 extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { 58 bool is_setup_done; 59 CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); 60 if (is_setup_done) return CEED_ERROR_SUCCESS; 61 62 Ceed ceed; 63 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 64 Ceed_Sycl *sycl_data; 65 CeedCallBackend(CeedGetData(ceed, &sycl_data)); 66 67 CeedOperator_Sycl_gen *impl; 68 CeedCallBackend(CeedOperatorGetData(op, &impl)); 69 Fields_Sycl h_B, h_G; 70 FieldsInt_Sycl h_indices; 71 CeedQFunction qf; 72 CeedQFunction_Sycl_gen *qf_impl; 73 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 74 CeedCallBackend(CeedQFunctionGetData(qf, &qf_impl)); 75 CeedSize lsize; 76 CeedInt Q, P_1d = 0, Q_1d = 0, elem_size, num_input_fields, num_output_fields, num_comp, dim = 1; 77 CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); 78 Q_1d = Q; 79 80 CeedOperatorField *op_input_fields, *op_output_fields; 81 CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 82 CeedQFunctionField *qf_input_fields, *qf_output_fields; 83 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 84 85 CeedEvalMode eval_mode; 86 CeedBasis basis; 87 CeedBasis_Sycl_shared *basis_impl; 88 CeedElemRestriction Erestrict; 89 CeedElemRestriction_Sycl *restr_impl; 90 91 // Check for restriction only identity operator 92 bool is_identity_qf; 93 CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf)); 94 if (is_identity_qf) { 95 CeedEvalMode eval_mode_in, eval_mode_out; 96 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in)); 97 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out)); 98 if (eval_mode_in == CEED_EVAL_NONE && eval_mode_out == CEED_EVAL_NONE) { 99 // LCOV_EXCL_START 100 return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement restriction only identity operators"); 101 // LCOV_EXCL_STOP 102 } 103 } 104 105 std::ostringstream code; 106 // TODO: generalize to accept different device functions? 107 { 108 char *tensor_basis_kernel_path, *tensor_basis_code; 109 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h", &tensor_basis_kernel_path)); 110 CeedDebug256(ceed, 2, "----- Loading Tensor Basis Kernel Source -----\n"); 111 CeedCallBackend(CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_code)); 112 code << tensor_basis_code; 113 CeedCallBackend(CeedFree(&tensor_basis_kernel_path)); 114 CeedCallBackend(CeedFree(&tensor_basis_code)); 115 } 116 { 117 char *sycl_gen_template_path, *sycl_gen_template_source; 118 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/sycl/sycl-gen-templates.h", &sycl_gen_template_path)); 119 CeedDebug256(ceed, 2, "----- Loading Sycl-Gen Template Source -----\n"); 120 CeedCallBackend(CeedLoadSourceToBuffer(ceed, sycl_gen_template_path, &sycl_gen_template_source)); 121 code << sycl_gen_template_source; 122 CeedCallBackend(CeedFree(&sycl_gen_template_path)); 123 CeedCallBackend(CeedFree(&sycl_gen_template_source)); 124 } 125 126 std::string_view q_function_source(qf_impl->q_function_source); 127 std::string_view q_function_name(qf_impl->q_function_name); 128 const std::string operator_name = "CeedKernelSyclGenOperator_" + std::string(q_function_name); 129 130 // Find dim, P_1d, Q_1d 131 impl->max_P_1d = 0; 132 for (CeedInt i = 0; i < num_input_fields; i++) { 133 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 134 if (basis != CEED_BASIS_NONE) { 135 CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); 136 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 137 138 // Collect dim, P_1d, and Q_1d 139 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 140 bool isTensor; 141 CeedCallBackend(CeedBasisIsTensor(basis, &isTensor)); 142 if (isTensor) { 143 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 144 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 145 if (P_1d > impl->max_P_1d) impl->max_P_1d = P_1d; 146 } else { 147 // LCOV_EXCL_START 148 return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 149 // LCOV_EXCL_STOP 150 } 151 } 152 } 153 // Check output bases for Q_1d, dim as well 154 // The only input basis might be CEED_BASIS_NONE 155 for (CeedInt i = 0; i < num_output_fields; i++) { 156 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 157 158 if (basis != CEED_BASIS_NONE) { 159 CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); 160 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 161 162 // Collect Q_1d 163 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 164 bool isTensor; 165 CeedCallBackend(CeedBasisIsTensor(basis, &isTensor)); 166 if (isTensor) { 167 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 168 } else { 169 // LCOV_EXCL_START 170 return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 171 // LCOV_EXCL_STOP 172 } 173 } 174 } 175 impl->dim = dim; 176 impl->Q_1d = Q_1d; 177 178 // Only use 3D collocated gradient parallelization strategy when gradient is computed 179 // TODO: put in a function? 180 bool use_collograd_parallelization = false; 181 if (dim == 3) { 182 bool was_grad_found = false; 183 for (CeedInt i = 0; i < num_input_fields; i++) { 184 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 185 if (eval_mode == CEED_EVAL_GRAD) { 186 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 187 CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); 188 use_collograd_parallelization = basis_impl->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true); 189 was_grad_found = true; 190 } 191 } 192 for (CeedInt i = 0; i < num_output_fields; i++) { 193 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 194 if (eval_mode == CEED_EVAL_GRAD) { 195 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 196 CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); 197 use_collograd_parallelization = basis_impl->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true); 198 was_grad_found = true; 199 } 200 } 201 } 202 203 CeedInt block_sizes[3]; 204 CeedCallBackend(BlockGridCalculate_Sycl_gen(dim, P_1d, Q_1d, block_sizes)); 205 206 // Define CEED_Q_VLA 207 code << "\n#undef CEED_Q_VLA\n"; 208 if (dim != 3 || use_collograd_parallelization) { 209 code << "#define CEED_Q_VLA 1\n\n"; 210 } else { 211 code << "#define CEED_Q_VLA " << Q_1d << "\n\n"; 212 } 213 214 // Determine subgroup size based on supported sizes : Default : 16 (if supported) 215 std::vector allowed_sg_sizes = sycl_data->sycl_device.get_info<sycl::info::device::sub_group_sizes>(); 216 CeedInt sub_group_size_op = allowed_sg_sizes[allowed_sg_sizes.size() - 1]; 217 for (const auto &s : allowed_sg_sizes) { 218 if (s == 16) { 219 sub_group_size_op = s; 220 break; 221 } 222 } 223 224 code << q_function_source; 225 226 // Kernel function 227 code << "\n// -----------------------------------------------------------------------------\n"; 228 code << "__attribute__((reqd_work_group_size(GROUP_SIZE_X, GROUP_SIZE_Y, GROUP_SIZE_Z), intel_reqd_sub_group_size(" << sub_group_size_op << ")))\n"; 229 code << "kernel void " << operator_name << "("; 230 code << "const CeedInt num_elem, "; 231 code << "global void* ctx, "; 232 code << "global const FieldsInt_Sycl* indices, "; 233 code << "global Fields_Sycl* fields, "; 234 code << "global const Fields_Sycl* B, "; 235 code << "global const Fields_Sycl* G, "; 236 code << "global const CeedScalar * restrict W"; 237 code << ") {\n"; 238 239 for (CeedInt i = 0; i < num_input_fields; i++) { 240 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 241 if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 242 code << " global const CeedScalar* d_u_" << i << " = fields->inputs[" << i << "];\n"; 243 } 244 } 245 246 for (CeedInt i = 0; i < num_output_fields; i++) { 247 code << " global CeedScalar* d_v_" << i << " = fields->outputs[" << i << "];\n"; 248 } 249 250 // TODO: Convert these to defined constants to save on GRF 251 code << " const CeedInt DIM = " << dim << ";\n"; 252 code << " const CeedInt Q_1D = " << Q_1d << ";\n"; 253 254 const CeedInt scratch_size = block_sizes[0] * block_sizes[1] * block_sizes[2]; 255 code << " local CeedScalar scratch[" << scratch_size << "];\n"; 256 code << " local CeedScalar * elem_scratch = scratch + get_local_id(2) * T_1D" << (dim > 1 ? "*T_1D" : "") << ";\n"; 257 258 code << "\n // -- Input field constants and basis data --\n"; 259 // Initialize constants, and matrices B and G 260 for (CeedInt i = 0; i < num_input_fields; i++) { 261 code << " // ---- Input field " << i << " ----\n"; 262 // Get elem_size, eval_mode, num_comp 263 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict)); 264 CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size)); 265 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 266 CeedCallBackend(CeedElemRestrictionGetNumComponents(Erestrict, &num_comp)); 267 268 // Set field constants 269 if (eval_mode != CEED_EVAL_WEIGHT) { 270 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 271 if (basis != CEED_BASIS_NONE) { 272 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 273 code << " const CeedInt P_in_" << i << " = " << P_1d << ";\n"; 274 } else { 275 code << " const CeedInt P_in_" << i << " = " << Q_1d << ";\n"; 276 } 277 code << " const CeedInt num_comp_in_" << i << " = " << num_comp << ";\n"; 278 } 279 280 // Load basis data 281 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 282 switch (eval_mode) { 283 case CEED_EVAL_NONE: 284 break; 285 case CEED_EVAL_INTERP: 286 CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); 287 h_B.inputs[i] = basis_impl->d_interp_1d; 288 code << " local CeedScalar s_B_in_" << i << "[" << P_1d * Q_1d << "];\n"; 289 code << " loadMatrix(P_in_" << i << "*Q_1D, B->inputs[" << i << "], s_B_in_" << i << ");\n"; 290 break; 291 case CEED_EVAL_GRAD: 292 CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); 293 h_B.inputs[i] = basis_impl->d_interp_1d; 294 code << " local CeedScalar s_B_in_" << i << "[" << P_1d * Q_1d << "];\n"; 295 code << " loadMatrix(P_in_" << i << "*Q_1D, B->inputs[" << i << "], s_B_in_" << i << ");\n"; 296 if (use_collograd_parallelization) { 297 h_G.inputs[i] = basis_impl->d_collo_grad_1d; 298 code << " local CeedScalar s_G_in_" << i << "[" << Q_1d * Q_1d << "];\n"; 299 code << " loadMatrix(Q_1D*Q_1D, G->inputs[" << i << "], s_G_in_" << i << ");\n"; 300 } else { 301 bool has_collo_grad = basis_impl->d_collo_grad_1d; 302 h_G.inputs[i] = has_collo_grad ? basis_impl->d_collo_grad_1d : basis_impl->d_grad_1d; 303 code << " local CeedScalar s_G_in_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n"; 304 code << " loadMatrix(" << (has_collo_grad ? "Q_1D" : ("P_in_" + std::to_string(i))) << "*Q_1D, G->inputs[" << i << "], s_G_in_" << i 305 << ");\n"; 306 } 307 break; 308 case CEED_EVAL_WEIGHT: 309 break; // No action 310 case CEED_EVAL_DIV: 311 break; // TODO: Not implemented 312 case CEED_EVAL_CURL: 313 break; // TODO: Not implemented 314 } 315 } 316 317 code << "\n // -- Output field constants and basis data --\n"; 318 for (CeedInt i = 0; i < num_output_fields; i++) { 319 code << " // ---- Output field " << i << " ----\n"; 320 // Get elem_size, eval_mode, num_comp 321 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &Erestrict)); 322 CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size)); 323 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 324 CeedCallBackend(CeedElemRestrictionGetNumComponents(Erestrict, &num_comp)); 325 326 // Set field constants 327 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 328 if (basis != CEED_BASIS_NONE) { 329 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 330 code << " const CeedInt P_out_" << i << " = " << P_1d << ";\n"; 331 } else { 332 code << " const CeedInt P_out_" << i << " = " << Q_1d << ";\n"; 333 } 334 code << " const CeedInt num_comp_out_" << i << " = " << num_comp << ";\n"; 335 336 // Load basis data 337 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 338 switch (eval_mode) { 339 case CEED_EVAL_NONE: 340 break; // No action 341 case CEED_EVAL_INTERP: 342 CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); 343 h_B.outputs[i] = basis_impl->d_interp_1d; 344 code << " local CeedScalar s_B_out_" << i << "[" << P_1d * Q_1d << "];\n"; 345 code << " loadMatrix(P_out_" << i << "*Q_1D, B->outputs[" << i << "], s_B_out_" << i << ");\n"; 346 break; 347 case CEED_EVAL_GRAD: 348 CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); 349 h_B.outputs[i] = basis_impl->d_interp_1d; 350 code << " local CeedScalar s_B_out_" << i << "[" << P_1d * Q_1d << "];\n"; 351 code << " loadMatrix(P_out_" << i << "*Q_1D, B->outputs[" << i << "], s_B_out_" << i << ");\n"; 352 if (use_collograd_parallelization) { 353 h_G.outputs[i] = basis_impl->d_collo_grad_1d; 354 code << " local CeedScalar s_G_out_" << i << "[" << Q_1d * Q_1d << "];\n"; 355 code << " loadMatrix(Q_1D*Q_1D, G->outputs[" << i << "], s_G_out_" << i << ");\n"; 356 } else { 357 bool has_collo_grad = basis_impl->d_collo_grad_1d; 358 h_G.outputs[i] = has_collo_grad ? basis_impl->d_collo_grad_1d : basis_impl->d_grad_1d; 359 code << " local CeedScalar s_G_out_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n"; 360 code << " loadMatrix(" << (has_collo_grad ? "Q_1D" : ("P_out_" + std::to_string(i))) << "*Q_1D, G->outputs[" << i << "], s_G_out_" << i 361 << ");\n"; 362 } 363 break; 364 // LCOV_EXCL_START 365 case CEED_EVAL_WEIGHT: { 366 Ceed ceed; 367 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 368 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 369 break; // Should not occur 370 } 371 case CEED_EVAL_DIV: 372 break; // TODO: Not implemented 373 case CEED_EVAL_CURL: 374 break; // TODO: Not implemented 375 // LCOV_EXCL_STOP 376 } 377 } 378 code << "\n // -- Element loop --\n"; 379 code << " work_group_barrier(CLK_LOCAL_MEM_FENCE);\n"; 380 code << " {\n"; 381 // Input basis apply if needed 382 // Generate the correct eval mode code for each input 383 code << " // -- Input field restrictions and basis actions --\n"; 384 for (CeedInt i = 0; i < num_input_fields; i++) { 385 code << " // ---- Input field " << i << " ----\n"; 386 // Get elem_size, eval_mode, num_comp 387 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict)); 388 CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size)); 389 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 390 CeedCallBackend(CeedElemRestrictionGetNumComponents(Erestrict, &num_comp)); 391 392 // Restriction 393 if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_collograd_parallelization)) { 394 code << " CeedScalar r_u_" << i << "[num_comp_in_" << i << "*P_in_" << i << "];\n"; 395 396 bool is_strided; 397 CeedCallBackend(CeedElemRestrictionIsStrided(Erestrict, &is_strided)); 398 if (!is_strided) { 399 CeedCallBackend(CeedElemRestrictionGetLVectorSize(Erestrict, &lsize)); 400 code << " const CeedInt lsize_in_" << i << " = " << lsize << ";\n"; 401 CeedInt comp_stride; 402 CeedCallBackend(CeedElemRestrictionGetCompStride(Erestrict, &comp_stride)); 403 code << " // CompStride: " << comp_stride << "\n"; 404 CeedCallBackend(CeedElemRestrictionGetData(Erestrict, &restr_impl)); 405 h_indices.inputs[i] = restr_impl->d_ind; 406 code << " readDofsOffset" << dim << "d(num_comp_in_" << i << ", " << comp_stride << ", P_in_" << i << ", num_elem, indices->inputs[" << i 407 << "], d_u_" << i << ", r_u_" << i << ");\n"; 408 } else { 409 bool has_backend_strides; 410 CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides)); 411 CeedInt num_elem; 412 CeedCallBackend(CeedElemRestrictionGetNumElements(Erestrict, &num_elem)); 413 CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 414 if (!has_backend_strides) { 415 CeedCallBackend(CeedElemRestrictionGetStrides(Erestrict, &strides)); 416 } 417 code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; 418 code << " readDofsStrided" << dim << "d(num_comp_in_" << i << ",P_in_" << i << "," << strides[0] << "," << strides[1] << "," << strides[2] 419 << ", num_elem, d_u_" << i << ", r_u_" << i << ");\n"; 420 } 421 } 422 423 // Basis action 424 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 425 switch (eval_mode) { 426 case CEED_EVAL_NONE: 427 if (!use_collograd_parallelization) { 428 code << " private CeedScalar* r_t_" << i << " = r_u_" << i << ";\n"; 429 } 430 break; 431 case CEED_EVAL_INTERP: 432 code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1D];\n"; 433 code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_in_" << i << ", P_in_" << i << ", Q_1D, r_u_" << i << ", s_B_in_" << i 434 << ", r_t_" << i << ", elem_scratch);\n"; 435 break; 436 case CEED_EVAL_GRAD: 437 if (use_collograd_parallelization) { 438 code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1D];\n"; 439 code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_in_" << i << ", P_in_" << i << ", Q_1D, r_u_" << i << ", s_B_in_" 440 << i << ", r_t_" << i << ", elem_scratch);\n"; 441 } else { 442 CeedInt P_1d; 443 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 444 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 445 code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "*DIM*Q_1D];\n"; 446 code << " Grad" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d(num_comp_in_" << i 447 << ", P_in_" << i << ", Q_1D, r_u_" << i << (dim > 1 ? ", s_B_in_" : "") << (dim > 1 ? std::to_string(i) : "") << ", s_G_in_" << i 448 << ", r_t_" << i << ", elem_scratch);\n"; 449 } 450 break; 451 case CEED_EVAL_WEIGHT: 452 code << " CeedScalar r_t_" << i << "[Q_1D];\n"; 453 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 454 CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); 455 impl->W = basis_impl->d_q_weight_1d; 456 code << " Weight" << (dim > 1 ? "Tensor" : "") << dim << "d(Q_1D, W, r_t_" << i << ");\n"; 457 break; // No action 458 case CEED_EVAL_DIV: 459 break; // TODO: Not implemented 460 case CEED_EVAL_CURL: 461 break; // TODO: Not implemented 462 } 463 } 464 465 // Q function 466 code << "\n // -- Output field setup --\n"; 467 for (CeedInt i = 0; i < num_output_fields; i++) { 468 code << "\n // ---- Output field " << i << " ----\n"; 469 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 470 if (eval_mode == CEED_EVAL_GRAD) { 471 if (use_collograd_parallelization) { 472 // Accumulator for gradient slices 473 code << " CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*Q_1D];\n"; 474 code << " for (CeedInt i = 0; i < num_comp_out_" << i << "; i++) {\n"; 475 code << " for (CeedInt j = 0; j < Q_1D; ++j) {\n"; 476 code << " r_tt_" << i << "[j + i*Q_1D] = 0.0;\n"; 477 code << " }\n"; 478 code << " }\n"; 479 } else { 480 code << " CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*DIM*Q_1D];\n"; 481 } 482 } 483 if (eval_mode == CEED_EVAL_NONE || eval_mode == CEED_EVAL_INTERP) { 484 code << " CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*Q_1D];\n"; 485 } 486 } 487 // We treat quadrature points per slice in 3d to save registers 488 if (use_collograd_parallelization) { 489 code << "\n // Note: Using planes of 3D elements\n"; 490 code << " for (CeedInt q = 0; q < Q_1D; q++) {\n"; 491 code << " // -- Input fields --\n"; 492 for (CeedInt i = 0; i < num_input_fields; i++) { 493 code << " // ---- Input field " << i << " ----\n"; 494 // Get elem_size, eval_mode, num_comp 495 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 496 // Basis action 497 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 498 switch (eval_mode) { 499 case CEED_EVAL_NONE: 500 code << " CeedScalar r_q_" << i << "[num_comp_in_" << i << "];\n"; 501 502 bool is_strided; 503 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict)); 504 CeedCallBackend(CeedElemRestrictionIsStrided(Erestrict, &is_strided)); 505 if (!is_strided) { 506 CeedCallBackend(CeedElemRestrictionGetLVectorSize(Erestrict, &lsize)); 507 code << " const CeedInt lsize_in_" << i << " = " << lsize << ";\n"; 508 CeedInt comp_stride; 509 CeedCallBackend(CeedElemRestrictionGetCompStride(Erestrict, &comp_stride)); 510 code << " // CompStride: " << comp_stride << "\n"; 511 CeedCallBackend(CeedElemRestrictionGetData(Erestrict, &restr_impl)); 512 h_indices.inputs[i] = restr_impl->d_ind; 513 code << " readSliceQuadsOffset" 514 << "3d(num_comp_in_" << i << ", " << comp_stride << ", Q_1D, lsize_in_" << i << ", num_elem, q, indices->inputs[" << i << "], d_u_" 515 << i << ", r_q_" << i << ");\n"; 516 } else { 517 CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size)); 518 bool has_backend_strides; 519 CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides)); 520 CeedInt num_elem; 521 CeedCallBackend(CeedElemRestrictionGetNumElements(Erestrict, &num_elem)); 522 CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 523 if (!has_backend_strides) { 524 CeedCallBackend(CeedElemRestrictionGetStrides(Erestrict, &strides)); 525 } 526 code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; 527 code << " readSliceQuadsStrided" 528 << "3d(num_comp_in_" << i << ", Q_1D," << strides[0] << ", " << strides[1] << ", " << strides[2] << ", num_elem, q, d_u_" << i 529 << ", r_q_" << i << ");\n"; 530 } 531 break; 532 case CEED_EVAL_INTERP: 533 code << " CeedScalar r_q_" << i << "[num_comp_in_" << i << "];\n"; 534 code << " for (CeedInt j = 0; j < num_comp_in_" << i << " ; ++j) {\n"; 535 code << " r_q_" << i << "[j] = r_t_" << i << "[q + j*Q_1D];\n"; 536 code << " }\n"; 537 break; 538 case CEED_EVAL_GRAD: 539 code << " CeedScalar r_q_" << i << "[num_comp_in_" << i << "*DIM];\n"; 540 code << " gradCollo3d(num_comp_in_" << i << ", Q_1D, q, r_t_" << i << ", s_G_in_" << i << ", r_q_" << i << ", elem_scratch);\n"; 541 break; 542 case CEED_EVAL_WEIGHT: 543 code << " CeedScalar r_q_" << i << "[1];\n"; 544 code << " r_q_" << i << "[0] = r_t_" << i << "[q];\n"; 545 break; // No action 546 case CEED_EVAL_DIV: 547 break; // TODO: Not implemented 548 case CEED_EVAL_CURL: 549 break; // TODO: Not implemented 550 } 551 } 552 code << "\n // -- Output fields --\n"; 553 for (CeedInt i = 0; i < num_output_fields; i++) { 554 code << " // ---- Output field " << i << " ----\n"; 555 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 556 // Basis action 557 switch (eval_mode) { 558 case CEED_EVAL_NONE: 559 code << " CeedScalar r_qq_" << i << "[num_comp_out_" << i << "];\n"; 560 break; // No action 561 case CEED_EVAL_INTERP: 562 code << " CeedScalar r_qq_" << i << "[num_comp_out_" << i << "];\n"; 563 break; 564 case CEED_EVAL_GRAD: 565 code << " CeedScalar r_qq_" << i << "[num_comp_out_" << i << "*DIM];\n"; 566 break; 567 case CEED_EVAL_WEIGHT: 568 break; // Should not occur 569 case CEED_EVAL_DIV: 570 break; // TODO: Not implemented 571 case CEED_EVAL_CURL: 572 break; // TODO: Not implemented 573 } 574 } 575 } else { 576 code << "\n // Note: Using full elements\n"; 577 code << " // -- Input fields --\n"; 578 for (CeedInt i = 0; i < num_input_fields; i++) { 579 code << " // ---- Input field " << i << " ----\n"; 580 code << " private CeedScalar* r_q_" << i << " = r_t_" << i << ";\n"; 581 } 582 code << " // -- Output fields --\n"; 583 for (CeedInt i = 0; i < num_output_fields; i++) { 584 code << " // ---- Output field " << i << " ----\n"; 585 code << " private CeedScalar* r_qq_" << i << " = r_tt_" << i << ";\n"; 586 } 587 } 588 //-------------------------------------------------- 589 code << "\n // -- QFunction Inputs and outputs --\n"; 590 code << " const CeedScalar * in[" << num_input_fields << "];\n"; 591 for (CeedInt i = 0; i < num_input_fields; i++) { 592 code << " // ---- Input field " << i << " ----\n"; 593 code << " in[" << i << "] = r_q_" << i << ";\n"; 594 } 595 code << " CeedScalar * out[" << num_output_fields << "];\n"; 596 for (CeedInt i = 0; i < num_output_fields; i++) { 597 code << " // ---- Output field " << i << " ----\n"; 598 code << " out[" << i << "] = r_qq_" << i << ";\n"; 599 } 600 601 code << "\n // -- Apply QFunction --\n"; 602 code << " " << q_function_name << "(ctx, "; 603 if (dim != 3 || use_collograd_parallelization) { 604 code << "1"; 605 } else { 606 code << "Q_1D"; 607 } 608 code << ", in, out);\n"; 609 //-------------------------------------------------- 610 611 if (use_collograd_parallelization) { 612 code << " // -- Output fields --\n"; 613 for (CeedInt i = 0; i < num_output_fields; i++) { 614 code << " // ---- Output field " << i << " ----\n"; 615 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 616 // Basis action 617 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 618 switch (eval_mode) { 619 case CEED_EVAL_NONE: 620 code << " for (CeedInt j = 0; j < num_comp_out_" << i << " ; ++j) {\n"; 621 code << " r_tt_" << i << "[q + j*Q_1D] = r_qq_" << i << "[j];\n"; 622 code << " }\n"; 623 break; // No action 624 case CEED_EVAL_INTERP: 625 code << " for (CeedInt j = 0; j < num_comp_out_" << i << " ; ++j) {\n"; 626 code << " r_tt_" << i << "[q + j*Q_1D] = r_qq_" << i << "[j];\n"; 627 code << " }\n"; 628 break; 629 case CEED_EVAL_GRAD: 630 code << " gradColloTranspose3d(num_comp_out_" << i << ",Q_1D, q, r_qq_" << i << ", s_G_out_" << i << ", r_tt_" << i 631 << ", elem_scratch);\n"; 632 break; 633 case CEED_EVAL_WEIGHT: 634 break; // Should not occur 635 case CEED_EVAL_DIV: 636 break; // TODO: Not implemented 637 case CEED_EVAL_CURL: 638 break; // TODO: Not implemented 639 } 640 } 641 code << " }\n"; 642 } 643 644 // Output basis apply if needed 645 // Generate the correct eval mode code for each output 646 code << "\n // -- Output field basis action and restrictions --\n"; 647 for (CeedInt i = 0; i < num_output_fields; i++) { 648 code << " // ---- Output field " << i << " ----\n"; 649 // Get elem_size, eval_mode, num_comp 650 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &Erestrict)); 651 CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size)); 652 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 653 CeedCallBackend(CeedElemRestrictionGetNumComponents(Erestrict, &num_comp)); 654 // Basis action 655 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 656 switch (eval_mode) { 657 case CEED_EVAL_NONE: 658 code << " private CeedScalar* r_v_" << i << " = r_tt_" << i << ";\n"; 659 break; // No action 660 case CEED_EVAL_INTERP: 661 code << " CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n"; 662 code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_out_" << i << ",P_out_" << i << ", Q_1D, r_tt_" << i 663 << ", s_B_out_" << i << ", r_v_" << i << ", elem_scratch);\n"; 664 break; 665 case CEED_EVAL_GRAD: 666 code << " CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n"; 667 if (use_collograd_parallelization) { 668 code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_out_" << i << ",P_out_" << i << ", Q_1D, r_tt_" << i 669 << ", s_B_out_" << i << ", r_v_" << i << ", elem_scratch);\n"; 670 } else { 671 CeedInt P_1d; 672 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 673 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 674 code << " GradTranspose" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d(num_comp_out_" << i 675 << ", P_out_" << i << ", Q_1D, r_tt_" << i << (dim > 1 ? ", s_B_out_" : "") << (dim > 1 ? std::to_string(i) : "") << ", s_G_out_" << i 676 << ", r_v_" << i << ", elem_scratch);\n"; 677 } 678 break; 679 // LCOV_EXCL_START 680 case CEED_EVAL_WEIGHT: { 681 Ceed ceed; 682 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 683 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 684 break; // Should not occur 685 } 686 case CEED_EVAL_DIV: 687 break; // TODO: Not implemented 688 case CEED_EVAL_CURL: 689 break; // TODO: Not implemented 690 // LCOV_EXCL_STOP 691 } 692 // Restriction 693 bool is_strided; 694 CeedCallBackend(CeedElemRestrictionIsStrided(Erestrict, &is_strided)); 695 if (!is_strided) { 696 CeedCallBackend(CeedElemRestrictionGetLVectorSize(Erestrict, &lsize)); 697 code << " const CeedInt lsize_out_" << i << " = " << lsize << ";\n"; 698 CeedInt comp_stride; 699 CeedCallBackend(CeedElemRestrictionGetCompStride(Erestrict, &comp_stride)); 700 code << " // CompStride: " << comp_stride << "\n"; 701 CeedCallBackend(CeedElemRestrictionGetData(Erestrict, &restr_impl)); 702 h_indices.outputs[i] = restr_impl->d_ind; 703 code << " writeDofsOffset" << dim << "d(num_comp_out_" << i << ", " << comp_stride << ", P_out_" << i << ", num_elem, indices->outputs[" << i 704 << "], r_v_" << i << ", d_v_" << i << ");\n"; 705 } else { 706 bool has_backend_strides; 707 CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides)); 708 CeedInt num_elem; 709 CeedCallBackend(CeedElemRestrictionGetNumElements(Erestrict, &num_elem)); 710 CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 711 if (!has_backend_strides) { 712 CeedCallBackend(CeedElemRestrictionGetStrides(Erestrict, &strides)); 713 } 714 code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; 715 code << " writeDofsStrided" << dim << "d(num_comp_out_" << i << ",P_out_" << i << "," << strides[0] << "," << strides[1] << "," << strides[2] 716 << ", num_elem, r_v_" << i << ", d_v_" << i << ");\n"; 717 } 718 } 719 720 code << " }\n"; 721 code << "}\n"; 722 code << "// -----------------------------------------------------------------------------\n\n"; 723 724 // Copy the struct (containing device addresses) from the host to the device 725 sycl::event copy_B = sycl_data->sycl_queue.copy<Fields_Sycl>(&h_B, impl->B, 1); 726 sycl::event copy_G = sycl_data->sycl_queue.copy<Fields_Sycl>(&h_G, impl->G, 1); 727 sycl::event copy_indices = sycl_data->sycl_queue.copy<FieldsInt_Sycl>(&h_indices, impl->indices, 1); 728 // These copies can happen while the JIT is being done 729 CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_B, copy_G, copy_indices})); 730 731 // View kernel for debugging 732 CeedDebug256(ceed, 2, "Generated Operator Kernels:\n"); 733 CeedDebug(ceed, code.str().c_str()); 734 735 std::map<std::string, CeedInt> jit_constants; 736 jit_constants["T_1D"] = block_sizes[0]; 737 jit_constants["GROUP_SIZE_X"] = block_sizes[0]; 738 jit_constants["GROUP_SIZE_Y"] = block_sizes[1]; 739 jit_constants["GROUP_SIZE_Z"] = block_sizes[2]; 740 741 // Compile kernel into a kernel bundle 742 CeedCallBackend(CeedBuildModule_Sycl(ceed, code.str(), &impl->sycl_module, jit_constants)); 743 744 // Load kernel function 745 CeedCallBackend(CeedGetKernel_Sycl(ceed, impl->sycl_module, operator_name, &impl->op)); 746 747 CeedCallBackend(CeedOperatorSetSetupDone(op)); 748 return CEED_ERROR_SUCCESS; 749 } 750 751 //------------------------------------------------------------------------------ 752