1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #define CEED_DEBUG_COLOR 12 9 10 #include <ceed/backend.h> 11 #include <ceed/ceed.h> 12 #include <ceed/jit-source/sycl/sycl-types.h> 13 #include <ceed/jit-tools.h> 14 15 #include <iostream> 16 #include <sstream> 17 #include <string> 18 #include <string_view> 19 #include <vector> 20 21 #include "../sycl-ref/ceed-sycl-ref.hpp" 22 #include "../sycl-shared/ceed-sycl-shared.hpp" 23 #include "../sycl/ceed-sycl-compile.hpp" 24 25 #include "ceed-sycl-gen.hpp" 26 27 //------------------------------------------------------------------------------ 28 // Calculate the block size used for launching the operator kernel 29 //------------------------------------------------------------------------------ 30 extern "C" int BlockGridCalculate_Sycl_gen(const CeedInt dim, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes) { 31 const CeedInt thread1d = CeedIntMax(Q_1d, P_1d); 32 33 if (dim == 1) { 34 CeedInt elems_per_block = 64 * thread1d > 256 ? 256 / thread1d : 64; 35 36 elems_per_block = elems_per_block > 0 ? elems_per_block : 1; 37 block_sizes[0] = thread1d; 38 block_sizes[1] = 1; 39 block_sizes[2] = elems_per_block; 40 } else if (dim == 2) { 41 const CeedInt elems_per_block = thread1d < 4 ? 16 : 2; 42 43 block_sizes[0] = thread1d; 44 block_sizes[1] = thread1d; 45 block_sizes[2] = elems_per_block; 46 } else if (dim == 3) { 47 const CeedInt elems_per_block = thread1d < 6 ? 4 : (thread1d < 8 ? 2 : 1); 48 49 block_sizes[0] = thread1d; 50 block_sizes[1] = thread1d; 51 block_sizes[2] = elems_per_block; 52 } 53 return CEED_ERROR_SUCCESS; 54 } 55 56 //------------------------------------------------------------------------------ 57 // Build single operator kernel 58 // - [ ] Check arguments to device functions reudsed from sycl-shared-basis are correct 59 // - [ ] Do kernel jitting! 60 //------------------------------------------------------------------------------ 61 extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { 62 Ceed ceed; 63 Ceed_Sycl *sycl_data; 64 bool is_setup_done, is_identity_qf; 65 CeedSize l_size; 66 CeedInt Q, P_1d = 0, Q_1d = 0, elem_size, num_input_fields, num_output_fields, num_comp, dim = 1; 67 Fields_Sycl h_B, h_G; 68 FieldsInt_Sycl h_indices; 69 CeedEvalMode eval_mode; 70 CeedElemRestriction elem_rstr; 71 CeedElemRestriction_Sycl *rstr_impl; 72 CeedBasis basis; 73 CeedBasis_Sycl_shared *basis_impl; 74 CeedQFunctionField *qf_input_fields, *qf_output_fields; 75 CeedQFunction_Sycl_gen *qf_impl; 76 CeedQFunction qf; 77 CeedOperatorField *op_input_fields, *op_output_fields; 78 CeedOperator_Sycl_gen *impl; 79 80 CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); 81 if (is_setup_done) return CEED_ERROR_SUCCESS; 82 83 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 84 CeedCallBackend(CeedGetData(ceed, &sycl_data)); 85 86 CeedCallBackend(CeedOperatorGetData(op, &impl)); 87 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 88 CeedCallBackend(CeedQFunctionGetData(qf, &qf_impl)); 89 CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); 90 Q_1d = Q; 91 92 CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 93 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 94 95 // Check for restriction only identity operator 96 CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf)); 97 if (is_identity_qf) { 98 CeedEvalMode eval_mode_in, eval_mode_out; 99 100 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in)); 101 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out)); 102 CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND, 103 "Backend does not implement restriction only identity operators"); 104 } 105 106 std::ostringstream code; 107 // TODO: generalize to accept different device functions? 108 { 109 char *tensor_basis_code; 110 const char *tensor_basis_kernel_path; 111 112 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h", &tensor_basis_kernel_path)); 113 CeedDebug256(ceed, 2, "----- Loading Tensor Basis Kernel Source -----\n"); 114 CeedCallBackend(CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_code)); 115 code << tensor_basis_code; 116 CeedCallBackend(CeedFree(&tensor_basis_kernel_path)); 117 CeedCallBackend(CeedFree(&tensor_basis_code)); 118 } 119 { 120 char *sycl_gen_template_source; 121 const char *sycl_gen_template_path; 122 123 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/sycl/sycl-gen-templates.h", &sycl_gen_template_path)); 124 CeedDebug256(ceed, 2, "----- Loading Sycl-Gen Template Source -----\n"); 125 CeedCallBackend(CeedLoadSourceToBuffer(ceed, sycl_gen_template_path, &sycl_gen_template_source)); 126 code << sycl_gen_template_source; 127 CeedCallBackend(CeedFree(&sycl_gen_template_path)); 128 CeedCallBackend(CeedFree(&sycl_gen_template_source)); 129 } 130 131 std::string_view qfunction_source(qf_impl->qfunction_source); 132 std::string_view qfunction_name(qf_impl->qfunction_name); 133 const std::string operator_name = "CeedKernelSyclGenOperator_" + std::string(qfunction_name); 134 135 // Find dim, P_1d, Q_1d 136 impl->max_P_1d = 0; 137 for (CeedInt i = 0; i < num_input_fields; i++) { 138 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 139 if (basis != CEED_BASIS_NONE) { 140 bool is_tensor; 141 142 CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); 143 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 144 145 // Collect dim, P_1d, and Q_1d 146 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 147 CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 148 if (is_tensor) { 149 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 150 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 151 if (P_1d > impl->max_P_1d) impl->max_P_1d = P_1d; 152 } else { 153 // LCOV_EXCL_START 154 return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 155 // LCOV_EXCL_STOP 156 } 157 } 158 } 159 // Check output bases for Q_1d, dim as well 160 // The only input basis might be CEED_BASIS_NONE 161 for (CeedInt i = 0; i < num_output_fields; i++) { 162 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 163 164 if (basis != CEED_BASIS_NONE) { 165 bool is_tensor; 166 167 CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); 168 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 169 170 // Collect Q_1d 171 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 172 CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 173 if (is_tensor) { 174 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 175 } else { 176 // LCOV_EXCL_START 177 return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 178 // LCOV_EXCL_STOP 179 } 180 } 181 } 182 impl->dim = dim; 183 impl->Q_1d = Q_1d; 184 185 // Only use 3D collocated gradient parallelization strategy when gradient is computed 186 // TODO: put in a function? 187 bool use_collograd_parallelization = false; 188 189 if (dim == 3) { 190 bool was_grad_found = false; 191 192 for (CeedInt i = 0; i < num_input_fields; i++) { 193 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 194 if (eval_mode == CEED_EVAL_GRAD) { 195 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 196 CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); 197 use_collograd_parallelization = basis_impl->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true); 198 was_grad_found = true; 199 } 200 } 201 for (CeedInt i = 0; i < num_output_fields; i++) { 202 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 203 if (eval_mode == CEED_EVAL_GRAD) { 204 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 205 CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); 206 use_collograd_parallelization = basis_impl->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true); 207 was_grad_found = true; 208 } 209 } 210 } 211 212 CeedInt block_sizes[3]; 213 CeedCallBackend(BlockGridCalculate_Sycl_gen(dim, P_1d, Q_1d, block_sizes)); 214 215 // Define CEED_Q_VLA 216 code << "\n#undef CEED_Q_VLA\n"; 217 if (dim != 3 || use_collograd_parallelization) { 218 code << "#define CEED_Q_VLA 1\n\n"; 219 } else { 220 code << "#define CEED_Q_VLA " << Q_1d << "\n\n"; 221 } 222 223 // Determine subgroup size based on supported sizes : Default : 16 (if supported) 224 std::vector allowed_sg_sizes = sycl_data->sycl_device.get_info<sycl::info::device::sub_group_sizes>(); 225 CeedInt sub_group_size_op = allowed_sg_sizes[allowed_sg_sizes.size() - 1]; 226 for (const auto &s : allowed_sg_sizes) { 227 if (s == 16) { 228 sub_group_size_op = s; 229 break; 230 } 231 } 232 233 code << qfunction_source; 234 235 // Kernel function 236 code << "\n// -----------------------------------------------------------------------------\n"; 237 code << "__attribute__((reqd_work_group_size(GROUP_SIZE_X, GROUP_SIZE_Y, GROUP_SIZE_Z), intel_reqd_sub_group_size(" << sub_group_size_op << ")))\n"; 238 code << "kernel void " << operator_name << "("; 239 code << "const CeedInt num_elem, "; 240 code << "global void* ctx, "; 241 code << "global const FieldsInt_Sycl* indices, "; 242 code << "global Fields_Sycl* fields, "; 243 code << "global const Fields_Sycl* B, "; 244 code << "global const Fields_Sycl* G, "; 245 code << "global const CeedScalar * restrict W"; 246 code << ") {\n"; 247 248 for (CeedInt i = 0; i < num_input_fields; i++) { 249 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 250 if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 251 code << " global const CeedScalar* d_u_" << i << " = fields->inputs[" << i << "];\n"; 252 } 253 } 254 255 for (CeedInt i = 0; i < num_output_fields; i++) { 256 code << " global CeedScalar* d_v_" << i << " = fields->outputs[" << i << "];\n"; 257 } 258 259 // TODO: Convert these to defined constants to save on GRF 260 code << " const CeedInt DIM = " << dim << ";\n"; 261 code << " const CeedInt Q_1D = " << Q_1d << ";\n"; 262 263 const CeedInt scratch_size = block_sizes[0] * block_sizes[1] * block_sizes[2]; 264 code << " local CeedScalar scratch[" << scratch_size << "];\n"; 265 code << " local CeedScalar * elem_scratch = scratch + get_local_id(2) * T_1D" << (dim > 1 ? "*T_1D" : "") << ";\n"; 266 267 code << "\n // -- Input field constants and basis data --\n"; 268 // Initialize constants, and matrices B and G 269 for (CeedInt i = 0; i < num_input_fields; i++) { 270 code << " // ---- Input field " << i << " ----\n"; 271 // Get elem_size, eval_mode, num_comp 272 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 273 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 274 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 275 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 276 277 // Set field constants 278 if (eval_mode != CEED_EVAL_WEIGHT) { 279 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 280 if (basis != CEED_BASIS_NONE) { 281 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 282 code << " const CeedInt P_in_" << i << " = " << P_1d << ";\n"; 283 } else { 284 code << " const CeedInt P_in_" << i << " = " << Q_1d << ";\n"; 285 } 286 code << " const CeedInt num_comp_in_" << i << " = " << num_comp << ";\n"; 287 } 288 289 // Load basis data 290 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 291 switch (eval_mode) { 292 case CEED_EVAL_NONE: 293 break; 294 case CEED_EVAL_INTERP: 295 CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); 296 h_B.inputs[i] = basis_impl->d_interp_1d; 297 code << " local CeedScalar s_B_in_" << i << "[" << P_1d * Q_1d << "];\n"; 298 code << " loadMatrix(P_in_" << i << "*Q_1D, B->inputs[" << i << "], s_B_in_" << i << ");\n"; 299 break; 300 case CEED_EVAL_GRAD: 301 CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); 302 h_B.inputs[i] = basis_impl->d_interp_1d; 303 code << " local CeedScalar s_B_in_" << i << "[" << P_1d * Q_1d << "];\n"; 304 code << " loadMatrix(P_in_" << i << "*Q_1D, B->inputs[" << i << "], s_B_in_" << i << ");\n"; 305 if (use_collograd_parallelization) { 306 h_G.inputs[i] = basis_impl->d_collo_grad_1d; 307 code << " local CeedScalar s_G_in_" << i << "[" << Q_1d * Q_1d << "];\n"; 308 code << " loadMatrix(Q_1D*Q_1D, G->inputs[" << i << "], s_G_in_" << i << ");\n"; 309 } else { 310 bool has_collo_grad = basis_impl->d_collo_grad_1d; 311 h_G.inputs[i] = has_collo_grad ? basis_impl->d_collo_grad_1d : basis_impl->d_grad_1d; 312 code << " local CeedScalar s_G_in_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n"; 313 code << " loadMatrix(" << (has_collo_grad ? "Q_1D" : ("P_in_" + std::to_string(i))) << "*Q_1D, G->inputs[" << i << "], s_G_in_" << i 314 << ");\n"; 315 } 316 break; 317 case CEED_EVAL_WEIGHT: 318 break; // No action 319 case CEED_EVAL_DIV: 320 break; // TODO: Not implemented 321 case CEED_EVAL_CURL: 322 break; // TODO: Not implemented 323 } 324 } 325 326 code << "\n // -- Output field constants and basis data --\n"; 327 for (CeedInt i = 0; i < num_output_fields; i++) { 328 code << " // ---- Output field " << i << " ----\n"; 329 // Get elem_size, eval_mode, num_comp 330 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 331 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 332 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 333 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 334 335 // Set field constants 336 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 337 if (basis != CEED_BASIS_NONE) { 338 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 339 code << " const CeedInt P_out_" << i << " = " << P_1d << ";\n"; 340 } else { 341 code << " const CeedInt P_out_" << i << " = " << Q_1d << ";\n"; 342 } 343 code << " const CeedInt num_comp_out_" << i << " = " << num_comp << ";\n"; 344 345 // Load basis data 346 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 347 switch (eval_mode) { 348 case CEED_EVAL_NONE: 349 break; // No action 350 case CEED_EVAL_INTERP: 351 CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); 352 h_B.outputs[i] = basis_impl->d_interp_1d; 353 code << " local CeedScalar s_B_out_" << i << "[" << P_1d * Q_1d << "];\n"; 354 code << " loadMatrix(P_out_" << i << "*Q_1D, B->outputs[" << i << "], s_B_out_" << i << ");\n"; 355 break; 356 case CEED_EVAL_GRAD: 357 CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); 358 h_B.outputs[i] = basis_impl->d_interp_1d; 359 code << " local CeedScalar s_B_out_" << i << "[" << P_1d * Q_1d << "];\n"; 360 code << " loadMatrix(P_out_" << i << "*Q_1D, B->outputs[" << i << "], s_B_out_" << i << ");\n"; 361 if (use_collograd_parallelization) { 362 h_G.outputs[i] = basis_impl->d_collo_grad_1d; 363 code << " local CeedScalar s_G_out_" << i << "[" << Q_1d * Q_1d << "];\n"; 364 code << " loadMatrix(Q_1D*Q_1D, G->outputs[" << i << "], s_G_out_" << i << ");\n"; 365 } else { 366 bool has_collo_grad = basis_impl->d_collo_grad_1d; 367 h_G.outputs[i] = has_collo_grad ? basis_impl->d_collo_grad_1d : basis_impl->d_grad_1d; 368 code << " local CeedScalar s_G_out_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n"; 369 code << " loadMatrix(" << (has_collo_grad ? "Q_1D" : ("P_out_" + std::to_string(i))) << "*Q_1D, G->outputs[" << i << "], s_G_out_" << i 370 << ");\n"; 371 } 372 break; 373 // LCOV_EXCL_START 374 case CEED_EVAL_WEIGHT: { 375 Ceed ceed; 376 377 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 378 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 379 break; // Should not occur 380 } 381 case CEED_EVAL_DIV: 382 case CEED_EVAL_CURL: { 383 Ceed ceed; 384 385 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 386 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 387 break; // Should not occur 388 } 389 // LCOV_EXCL_STOP 390 } 391 } 392 code << "\n // -- Element loop --\n"; 393 code << " work_group_barrier(CLK_LOCAL_MEM_FENCE);\n"; 394 code << " {\n"; 395 // Input basis apply if needed 396 // Generate the correct eval mode code for each input 397 code << " // -- Input field restrictions and basis actions --\n"; 398 for (CeedInt i = 0; i < num_input_fields; i++) { 399 code << " // ---- Input field " << i << " ----\n"; 400 // Get elem_size, eval_mode, num_comp 401 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 402 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 403 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 404 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 405 406 // Restriction 407 if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_collograd_parallelization)) { 408 bool is_strided; 409 410 code << " CeedScalar r_u_" << i << "[num_comp_in_" << i << "*P_in_" << i << "];\n"; 411 412 CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); 413 if (!is_strided) { 414 CeedInt comp_stride; 415 416 CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 417 code << " const CeedInt l_size_in_" << i << " = " << l_size << ";\n"; 418 CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 419 code << " // CompStride: " << comp_stride << "\n"; 420 CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_impl)); 421 h_indices.inputs[i] = rstr_impl->d_ind; 422 code << " readDofsOffset" << dim << "d(num_comp_in_" << i << ", " << comp_stride << ", P_in_" << i << ", num_elem, indices->inputs[" << i 423 << "], d_u_" << i << ", r_u_" << i << ");\n"; 424 } else { 425 bool has_backend_strides; 426 CeedInt num_elem; 427 428 CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); 429 CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); 430 CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 431 432 if (!has_backend_strides) { 433 CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); 434 } 435 code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; 436 code << " readDofsStrided" << dim << "d(num_comp_in_" << i << ",P_in_" << i << "," << strides[0] << "," << strides[1] << "," << strides[2] 437 << ", num_elem, d_u_" << i << ", r_u_" << i << ");\n"; 438 } 439 } 440 441 // Basis action 442 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 443 switch (eval_mode) { 444 case CEED_EVAL_NONE: 445 if (!use_collograd_parallelization) { 446 code << " private CeedScalar* r_t_" << i << " = r_u_" << i << ";\n"; 447 } 448 break; 449 case CEED_EVAL_INTERP: 450 code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1D];\n"; 451 code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_in_" << i << ", P_in_" << i << ", Q_1D, r_u_" << i << ", s_B_in_" << i 452 << ", r_t_" << i << ", elem_scratch);\n"; 453 break; 454 case CEED_EVAL_GRAD: 455 if (use_collograd_parallelization) { 456 code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1D];\n"; 457 code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_in_" << i << ", P_in_" << i << ", Q_1D, r_u_" << i << ", s_B_in_" 458 << i << ", r_t_" << i << ", elem_scratch);\n"; 459 } else { 460 CeedInt P_1d; 461 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 462 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 463 code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "*DIM*Q_1D];\n"; 464 code << " Grad" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d(num_comp_in_" << i 465 << ", P_in_" << i << ", Q_1D, r_u_" << i << (dim > 1 ? ", s_B_in_" : "") << (dim > 1 ? std::to_string(i) : "") << ", s_G_in_" << i 466 << ", r_t_" << i << ", elem_scratch);\n"; 467 } 468 break; 469 case CEED_EVAL_WEIGHT: 470 code << " CeedScalar r_t_" << i << "[Q_1D];\n"; 471 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 472 CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); 473 impl->W = basis_impl->d_q_weight_1d; 474 code << " Weight" << (dim > 1 ? "Tensor" : "") << dim << "d(Q_1D, W, r_t_" << i << ");\n"; 475 break; // No action 476 case CEED_EVAL_DIV: 477 break; // TODO: Not implemented 478 case CEED_EVAL_CURL: 479 break; // TODO: Not implemented 480 } 481 } 482 483 // Q function 484 code << "\n // -- Output field setup --\n"; 485 for (CeedInt i = 0; i < num_output_fields; i++) { 486 code << "\n // ---- Output field " << i << " ----\n"; 487 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 488 if (eval_mode == CEED_EVAL_GRAD) { 489 if (use_collograd_parallelization) { 490 // Accumulator for gradient slices 491 code << " CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*Q_1D];\n"; 492 code << " for (CeedInt i = 0; i < num_comp_out_" << i << "; i++) {\n"; 493 code << " for (CeedInt j = 0; j < Q_1D; ++j) {\n"; 494 code << " r_tt_" << i << "[j + i*Q_1D] = 0.0;\n"; 495 code << " }\n"; 496 code << " }\n"; 497 } else { 498 code << " CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*DIM*Q_1D];\n"; 499 } 500 } 501 if (eval_mode == CEED_EVAL_NONE || eval_mode == CEED_EVAL_INTERP) { 502 code << " CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*Q_1D];\n"; 503 } 504 } 505 // We treat quadrature points per slice in 3d to save registers 506 if (use_collograd_parallelization) { 507 code << "\n // Note: Using planes of 3D elements\n"; 508 code << " for (CeedInt q = 0; q < Q_1D; q++) {\n"; 509 code << " // -- Input fields --\n"; 510 for (CeedInt i = 0; i < num_input_fields; i++) { 511 code << " // ---- Input field " << i << " ----\n"; 512 // Get elem_size, eval_mode, num_comp 513 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 514 // Basis action 515 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 516 switch (eval_mode) { 517 case CEED_EVAL_NONE: 518 bool is_strided; 519 520 code << " CeedScalar r_q_" << i << "[num_comp_in_" << i << "];\n"; 521 522 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 523 CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); 524 if (!is_strided) { 525 CeedInt comp_stride; 526 527 CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 528 code << " const CeedInt l_size_in_" << i << " = " << l_size << ";\n"; 529 CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 530 code << " // CompStride: " << comp_stride << "\n"; 531 CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_impl)); 532 h_indices.inputs[i] = rstr_impl->d_ind; 533 code << " readSliceQuadsOffset" 534 << "3d(num_comp_in_" << i << ", " << comp_stride << ", Q_1D, l_size_in_" << i << ", num_elem, q, indices->inputs[" << i << "], d_u_" 535 << i << ", r_q_" << i << ");\n"; 536 } else { 537 bool has_backend_strides; 538 CeedInt num_elem; 539 540 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 541 CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); 542 CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); 543 CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 544 545 if (!has_backend_strides) { 546 CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); 547 } 548 code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; 549 code << " readSliceQuadsStrided" 550 << "3d(num_comp_in_" << i << ", Q_1D," << strides[0] << ", " << strides[1] << ", " << strides[2] << ", num_elem, q, d_u_" << i 551 << ", r_q_" << i << ");\n"; 552 } 553 break; 554 case CEED_EVAL_INTERP: 555 code << " CeedScalar r_q_" << i << "[num_comp_in_" << i << "];\n"; 556 code << " for (CeedInt j = 0; j < num_comp_in_" << i << " ; ++j) {\n"; 557 code << " r_q_" << i << "[j] = r_t_" << i << "[q + j*Q_1D];\n"; 558 code << " }\n"; 559 break; 560 case CEED_EVAL_GRAD: 561 code << " CeedScalar r_q_" << i << "[num_comp_in_" << i << "*DIM];\n"; 562 code << " gradCollo3d(num_comp_in_" << i << ", Q_1D, q, r_t_" << i << ", s_G_in_" << i << ", r_q_" << i << ", elem_scratch);\n"; 563 break; 564 case CEED_EVAL_WEIGHT: 565 code << " CeedScalar r_q_" << i << "[1];\n"; 566 code << " r_q_" << i << "[0] = r_t_" << i << "[q];\n"; 567 break; // No action 568 case CEED_EVAL_DIV: 569 break; // TODO: Not implemented 570 case CEED_EVAL_CURL: 571 break; // TODO: Not implemented 572 } 573 } 574 code << "\n // -- Output fields --\n"; 575 for (CeedInt i = 0; i < num_output_fields; i++) { 576 code << " // ---- Output field " << i << " ----\n"; 577 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 578 // Basis action 579 switch (eval_mode) { 580 case CEED_EVAL_NONE: 581 code << " CeedScalar r_qq_" << i << "[num_comp_out_" << i << "];\n"; 582 break; // No action 583 case CEED_EVAL_INTERP: 584 code << " CeedScalar r_qq_" << i << "[num_comp_out_" << i << "];\n"; 585 break; 586 case CEED_EVAL_GRAD: 587 code << " CeedScalar r_qq_" << i << "[num_comp_out_" << i << "*DIM];\n"; 588 break; 589 case CEED_EVAL_WEIGHT: 590 break; // Should not occur 591 case CEED_EVAL_DIV: 592 break; // TODO: Not implemented 593 case CEED_EVAL_CURL: 594 break; // TODO: Not implemented 595 } 596 } 597 } else { 598 code << "\n // Note: Using full elements\n"; 599 code << " // -- Input fields --\n"; 600 for (CeedInt i = 0; i < num_input_fields; i++) { 601 code << " // ---- Input field " << i << " ----\n"; 602 code << " private CeedScalar* r_q_" << i << " = r_t_" << i << ";\n"; 603 } 604 code << " // -- Output fields --\n"; 605 for (CeedInt i = 0; i < num_output_fields; i++) { 606 code << " // ---- Output field " << i << " ----\n"; 607 code << " private CeedScalar* r_qq_" << i << " = r_tt_" << i << ";\n"; 608 } 609 } 610 //-------------------------------------------------- 611 code << "\n // -- QFunction Inputs and outputs --\n"; 612 code << " const CeedScalar * in[" << num_input_fields << "];\n"; 613 for (CeedInt i = 0; i < num_input_fields; i++) { 614 code << " // ---- Input field " << i << " ----\n"; 615 code << " in[" << i << "] = r_q_" << i << ";\n"; 616 } 617 code << " CeedScalar * out[" << num_output_fields << "];\n"; 618 for (CeedInt i = 0; i < num_output_fields; i++) { 619 code << " // ---- Output field " << i << " ----\n"; 620 code << " out[" << i << "] = r_qq_" << i << ";\n"; 621 } 622 623 code << "\n // -- Apply QFunction --\n"; 624 code << " " << qfunction_name << "(ctx, "; 625 if (dim != 3 || use_collograd_parallelization) { 626 code << "1"; 627 } else { 628 code << "Q_1D"; 629 } 630 code << ", in, out);\n"; 631 //-------------------------------------------------- 632 633 if (use_collograd_parallelization) { 634 code << " // -- Output fields --\n"; 635 for (CeedInt i = 0; i < num_output_fields; i++) { 636 code << " // ---- Output field " << i << " ----\n"; 637 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 638 // Basis action 639 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 640 switch (eval_mode) { 641 case CEED_EVAL_NONE: 642 code << " for (CeedInt j = 0; j < num_comp_out_" << i << " ; ++j) {\n"; 643 code << " r_tt_" << i << "[q + j*Q_1D] = r_qq_" << i << "[j];\n"; 644 code << " }\n"; 645 break; // No action 646 case CEED_EVAL_INTERP: 647 code << " for (CeedInt j = 0; j < num_comp_out_" << i << " ; ++j) {\n"; 648 code << " r_tt_" << i << "[q + j*Q_1D] = r_qq_" << i << "[j];\n"; 649 code << " }\n"; 650 break; 651 case CEED_EVAL_GRAD: 652 code << " gradColloTranspose3d(num_comp_out_" << i << ",Q_1D, q, r_qq_" << i << ", s_G_out_" << i << ", r_tt_" << i 653 << ", elem_scratch);\n"; 654 break; 655 case CEED_EVAL_WEIGHT: 656 break; // Should not occur 657 case CEED_EVAL_DIV: 658 break; // TODO: Not implemented 659 case CEED_EVAL_CURL: 660 break; // TODO: Not implemented 661 } 662 } 663 code << " }\n"; 664 } 665 666 // Output basis apply if needed 667 // Generate the correct eval mode code for each output 668 code << "\n // -- Output field basis action and restrictions --\n"; 669 for (CeedInt i = 0; i < num_output_fields; i++) { 670 code << " // ---- Output field " << i << " ----\n"; 671 // Get elem_size, eval_mode, num_comp 672 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 673 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 674 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 675 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 676 // Basis action 677 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 678 switch (eval_mode) { 679 case CEED_EVAL_NONE: 680 code << " private CeedScalar* r_v_" << i << " = r_tt_" << i << ";\n"; 681 break; // No action 682 case CEED_EVAL_INTERP: 683 code << " CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n"; 684 code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_out_" << i << ",P_out_" << i << ", Q_1D, r_tt_" << i 685 << ", s_B_out_" << i << ", r_v_" << i << ", elem_scratch);\n"; 686 break; 687 case CEED_EVAL_GRAD: 688 code << " CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n"; 689 if (use_collograd_parallelization) { 690 code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_out_" << i << ",P_out_" << i << ", Q_1D, r_tt_" << i 691 << ", s_B_out_" << i << ", r_v_" << i << ", elem_scratch);\n"; 692 } else { 693 CeedInt P_1d; 694 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 695 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 696 code << " GradTranspose" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d(num_comp_out_" << i 697 << ", P_out_" << i << ", Q_1D, r_tt_" << i << (dim > 1 ? ", s_B_out_" : "") << (dim > 1 ? std::to_string(i) : "") << ", s_G_out_" << i 698 << ", r_v_" << i << ", elem_scratch);\n"; 699 } 700 break; 701 // LCOV_EXCL_START 702 case CEED_EVAL_WEIGHT: { 703 Ceed ceed; 704 705 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 706 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 707 break; // Should not occur 708 } 709 case CEED_EVAL_DIV: 710 case CEED_EVAL_CURL: { 711 Ceed ceed; 712 713 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 714 return CeedError(ceed, CEED_ERROR_BACKEND, "%s not supported", CeedEvalModes[eval_mode]); 715 break; // Should not occur 716 } 717 // LCOV_EXCL_STOP 718 } 719 // Restriction 720 bool is_strided; 721 722 CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); 723 if (!is_strided) { 724 CeedInt comp_stride; 725 726 CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 727 code << " const CeedInt l_size_out_" << i << " = " << l_size << ";\n"; 728 CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 729 code << " // CompStride: " << comp_stride << "\n"; 730 CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_impl)); 731 h_indices.outputs[i] = rstr_impl->d_ind; 732 code << " writeDofsOffset" << dim << "d(num_comp_out_" << i << ", " << comp_stride << ", P_out_" << i << ", num_elem, indices->outputs[" << i 733 << "], r_v_" << i << ", d_v_" << i << ");\n"; 734 } else { 735 bool has_backend_strides; 736 CeedInt num_elem; 737 738 CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); 739 CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); 740 CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 741 742 if (!has_backend_strides) { 743 CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); 744 } 745 code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; 746 code << " writeDofsStrided" << dim << "d(num_comp_out_" << i << ",P_out_" << i << "," << strides[0] << "," << strides[1] << "," << strides[2] 747 << ", num_elem, r_v_" << i << ", d_v_" << i << ");\n"; 748 } 749 } 750 751 code << " }\n"; 752 code << "}\n"; 753 code << "// -----------------------------------------------------------------------------\n\n"; 754 755 // Copy the struct (containing device addresses) from the host to the device 756 sycl::event copy_B = sycl_data->sycl_queue.copy<Fields_Sycl>(&h_B, impl->B, 1); 757 sycl::event copy_G = sycl_data->sycl_queue.copy<Fields_Sycl>(&h_G, impl->G, 1); 758 sycl::event copy_indices = sycl_data->sycl_queue.copy<FieldsInt_Sycl>(&h_indices, impl->indices, 1); 759 // These copies can happen while the JIT is being done 760 CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_B, copy_G, copy_indices})); 761 762 // View kernel for debugging 763 CeedDebug256(ceed, 2, "Generated Operator Kernels:\n"); 764 CeedDebug(ceed, code.str().c_str()); 765 766 std::map<std::string, CeedInt> jit_constants; 767 jit_constants["T_1D"] = block_sizes[0]; 768 jit_constants["GROUP_SIZE_X"] = block_sizes[0]; 769 jit_constants["GROUP_SIZE_Y"] = block_sizes[1]; 770 jit_constants["GROUP_SIZE_Z"] = block_sizes[2]; 771 772 // Compile kernel into a kernel bundle 773 CeedCallBackend(CeedBuildModule_Sycl(ceed, code.str(), &impl->sycl_module, jit_constants)); 774 775 // Load kernel function 776 CeedCallBackend(CeedGetKernel_Sycl(ceed, impl->sycl_module, operator_name, &impl->op)); 777 778 CeedCallBackend(CeedOperatorSetSetupDone(op)); 779 return CEED_ERROR_SUCCESS; 780 } 781 782 //------------------------------------------------------------------------------ 783