1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #define CEED_DEBUG_COLOR 12 9 10 #include <ceed/backend.h> 11 #include <ceed/ceed.h> 12 #include <ceed/jit-source/sycl/sycl-types.h> 13 #include <ceed/jit-tools.h> 14 15 #include <iostream> 16 #include <sstream> 17 #include <string> 18 #include <string_view> 19 #include <vector> 20 21 #include "../sycl-ref/ceed-sycl-ref.hpp" 22 #include "../sycl-shared/ceed-sycl-shared.hpp" 23 #include "../sycl/ceed-sycl-compile.hpp" 24 25 #include "ceed-sycl-gen.hpp" 26 27 //------------------------------------------------------------------------------ 28 // Calculate the block size used for launching the operator kernel 29 //------------------------------------------------------------------------------ 30 extern "C" int BlockGridCalculate_Sycl_gen(const CeedInt dim, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes) { 31 const CeedInt thread1d = CeedIntMax(Q_1d, P_1d); 32 33 if (dim == 1) { 34 CeedInt elems_per_block = 64 * thread1d > 256 ? 256 / thread1d : 64; 35 36 elems_per_block = elems_per_block > 0 ? elems_per_block : 1; 37 block_sizes[0] = thread1d; 38 block_sizes[1] = 1; 39 block_sizes[2] = elems_per_block; 40 } else if (dim == 2) { 41 const CeedInt elems_per_block = thread1d < 4 ? 16 : 2; 42 43 block_sizes[0] = thread1d; 44 block_sizes[1] = thread1d; 45 block_sizes[2] = elems_per_block; 46 } else if (dim == 3) { 47 const CeedInt elems_per_block = thread1d < 6 ? 4 : (thread1d < 8 ? 2 : 1); 48 49 block_sizes[0] = thread1d; 50 block_sizes[1] = thread1d; 51 block_sizes[2] = elems_per_block; 52 } 53 return CEED_ERROR_SUCCESS; 54 } 55 56 //------------------------------------------------------------------------------ 57 // Build single operator kernel 58 // - [ ] Check arguments to device functions reudsed from sycl-shared-basis are correct 59 // - [ ] Do kernel jitting! 60 //------------------------------------------------------------------------------ 61 extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) { 62 Ceed ceed; 63 Ceed_Sycl *sycl_data; 64 bool is_setup_done, is_identity_qf; 65 CeedSize l_size; 66 CeedInt Q, P_1d = 0, Q_1d = 0, elem_size, num_input_fields, num_output_fields, num_comp, dim = 1; 67 Fields_Sycl h_B, h_G; 68 FieldsInt_Sycl h_indices; 69 CeedEvalMode eval_mode; 70 CeedElemRestriction elem_rstr; 71 CeedElemRestriction_Sycl *rstr_impl; 72 CeedBasis basis; 73 CeedBasis_Sycl_shared *basis_impl; 74 CeedQFunctionField *qf_input_fields, *qf_output_fields; 75 CeedQFunction_Sycl_gen *qf_impl; 76 CeedQFunction qf; 77 CeedOperatorField *op_input_fields, *op_output_fields; 78 CeedOperator_Sycl_gen *impl; 79 80 CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); 81 if (is_setup_done) return CEED_ERROR_SUCCESS; 82 83 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 84 CeedCallBackend(CeedGetData(ceed, &sycl_data)); 85 86 CeedCallBackend(CeedOperatorGetData(op, &impl)); 87 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 88 CeedCallBackend(CeedQFunctionGetData(qf, &qf_impl)); 89 CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); 90 Q_1d = Q; 91 92 CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 93 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 94 95 // Check for restriction only identity operator 96 CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf)); 97 if (is_identity_qf) { 98 CeedEvalMode eval_mode_in, eval_mode_out; 99 100 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in)); 101 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out)); 102 if (eval_mode_in == CEED_EVAL_NONE && eval_mode_out == CEED_EVAL_NONE) { 103 // LCOV_EXCL_START 104 return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement restriction only identity operators"); 105 // LCOV_EXCL_STOP 106 } 107 } 108 109 std::ostringstream code; 110 // TODO: generalize to accept different device functions? 111 { 112 char *tensor_basis_kernel_path, *tensor_basis_code; 113 114 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h", &tensor_basis_kernel_path)); 115 CeedDebug256(ceed, 2, "----- Loading Tensor Basis Kernel Source -----\n"); 116 CeedCallBackend(CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_code)); 117 code << tensor_basis_code; 118 CeedCallBackend(CeedFree(&tensor_basis_kernel_path)); 119 CeedCallBackend(CeedFree(&tensor_basis_code)); 120 } 121 { 122 char *sycl_gen_template_path, *sycl_gen_template_source; 123 124 CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/sycl/sycl-gen-templates.h", &sycl_gen_template_path)); 125 CeedDebug256(ceed, 2, "----- Loading Sycl-Gen Template Source -----\n"); 126 CeedCallBackend(CeedLoadSourceToBuffer(ceed, sycl_gen_template_path, &sycl_gen_template_source)); 127 code << sycl_gen_template_source; 128 CeedCallBackend(CeedFree(&sycl_gen_template_path)); 129 CeedCallBackend(CeedFree(&sycl_gen_template_source)); 130 } 131 132 std::string_view q_function_source(qf_impl->q_function_source); 133 std::string_view q_function_name(qf_impl->q_function_name); 134 const std::string operator_name = "CeedKernelSyclGenOperator_" + std::string(q_function_name); 135 136 // Find dim, P_1d, Q_1d 137 impl->max_P_1d = 0; 138 for (CeedInt i = 0; i < num_input_fields; i++) { 139 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 140 if (basis != CEED_BASIS_NONE) { 141 bool is_tensor; 142 143 CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); 144 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 145 146 // Collect dim, P_1d, and Q_1d 147 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 148 CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 149 if (is_tensor) { 150 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 151 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 152 if (P_1d > impl->max_P_1d) impl->max_P_1d = P_1d; 153 } else { 154 // LCOV_EXCL_START 155 return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 156 // LCOV_EXCL_STOP 157 } 158 } 159 } 160 // Check output bases for Q_1d, dim as well 161 // The only input basis might be CEED_BASIS_NONE 162 for (CeedInt i = 0; i < num_output_fields; i++) { 163 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 164 165 if (basis != CEED_BASIS_NONE) { 166 bool is_tensor; 167 168 CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); 169 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 170 171 // Collect Q_1d 172 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 173 CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 174 if (is_tensor) { 175 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); 176 } else { 177 // LCOV_EXCL_START 178 return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 179 // LCOV_EXCL_STOP 180 } 181 } 182 } 183 impl->dim = dim; 184 impl->Q_1d = Q_1d; 185 186 // Only use 3D collocated gradient parallelization strategy when gradient is computed 187 // TODO: put in a function? 188 bool use_collograd_parallelization = false; 189 190 if (dim == 3) { 191 bool was_grad_found = false; 192 193 for (CeedInt i = 0; i < num_input_fields; i++) { 194 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 195 if (eval_mode == CEED_EVAL_GRAD) { 196 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 197 CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); 198 use_collograd_parallelization = basis_impl->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true); 199 was_grad_found = true; 200 } 201 } 202 for (CeedInt i = 0; i < num_output_fields; i++) { 203 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 204 if (eval_mode == CEED_EVAL_GRAD) { 205 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 206 CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); 207 use_collograd_parallelization = basis_impl->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true); 208 was_grad_found = true; 209 } 210 } 211 } 212 213 CeedInt block_sizes[3]; 214 CeedCallBackend(BlockGridCalculate_Sycl_gen(dim, P_1d, Q_1d, block_sizes)); 215 216 // Define CEED_Q_VLA 217 code << "\n#undef CEED_Q_VLA\n"; 218 if (dim != 3 || use_collograd_parallelization) { 219 code << "#define CEED_Q_VLA 1\n\n"; 220 } else { 221 code << "#define CEED_Q_VLA " << Q_1d << "\n\n"; 222 } 223 224 // Determine subgroup size based on supported sizes : Default : 16 (if supported) 225 std::vector allowed_sg_sizes = sycl_data->sycl_device.get_info<sycl::info::device::sub_group_sizes>(); 226 CeedInt sub_group_size_op = allowed_sg_sizes[allowed_sg_sizes.size() - 1]; 227 for (const auto &s : allowed_sg_sizes) { 228 if (s == 16) { 229 sub_group_size_op = s; 230 break; 231 } 232 } 233 234 code << q_function_source; 235 236 // Kernel function 237 code << "\n// -----------------------------------------------------------------------------\n"; 238 code << "__attribute__((reqd_work_group_size(GROUP_SIZE_X, GROUP_SIZE_Y, GROUP_SIZE_Z), intel_reqd_sub_group_size(" << sub_group_size_op << ")))\n"; 239 code << "kernel void " << operator_name << "("; 240 code << "const CeedInt num_elem, "; 241 code << "global void* ctx, "; 242 code << "global const FieldsInt_Sycl* indices, "; 243 code << "global Fields_Sycl* fields, "; 244 code << "global const Fields_Sycl* B, "; 245 code << "global const Fields_Sycl* G, "; 246 code << "global const CeedScalar * restrict W"; 247 code << ") {\n"; 248 249 for (CeedInt i = 0; i < num_input_fields; i++) { 250 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 251 if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 252 code << " global const CeedScalar* d_u_" << i << " = fields->inputs[" << i << "];\n"; 253 } 254 } 255 256 for (CeedInt i = 0; i < num_output_fields; i++) { 257 code << " global CeedScalar* d_v_" << i << " = fields->outputs[" << i << "];\n"; 258 } 259 260 // TODO: Convert these to defined constants to save on GRF 261 code << " const CeedInt DIM = " << dim << ";\n"; 262 code << " const CeedInt Q_1D = " << Q_1d << ";\n"; 263 264 const CeedInt scratch_size = block_sizes[0] * block_sizes[1] * block_sizes[2]; 265 code << " local CeedScalar scratch[" << scratch_size << "];\n"; 266 code << " local CeedScalar * elem_scratch = scratch + get_local_id(2) * T_1D" << (dim > 1 ? "*T_1D" : "") << ";\n"; 267 268 code << "\n // -- Input field constants and basis data --\n"; 269 // Initialize constants, and matrices B and G 270 for (CeedInt i = 0; i < num_input_fields; i++) { 271 code << " // ---- Input field " << i << " ----\n"; 272 // Get elem_size, eval_mode, num_comp 273 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 274 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 275 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 276 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 277 278 // Set field constants 279 if (eval_mode != CEED_EVAL_WEIGHT) { 280 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 281 if (basis != CEED_BASIS_NONE) { 282 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 283 code << " const CeedInt P_in_" << i << " = " << P_1d << ";\n"; 284 } else { 285 code << " const CeedInt P_in_" << i << " = " << Q_1d << ";\n"; 286 } 287 code << " const CeedInt num_comp_in_" << i << " = " << num_comp << ";\n"; 288 } 289 290 // Load basis data 291 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 292 switch (eval_mode) { 293 case CEED_EVAL_NONE: 294 break; 295 case CEED_EVAL_INTERP: 296 CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); 297 h_B.inputs[i] = basis_impl->d_interp_1d; 298 code << " local CeedScalar s_B_in_" << i << "[" << P_1d * Q_1d << "];\n"; 299 code << " loadMatrix(P_in_" << i << "*Q_1D, B->inputs[" << i << "], s_B_in_" << i << ");\n"; 300 break; 301 case CEED_EVAL_GRAD: 302 CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); 303 h_B.inputs[i] = basis_impl->d_interp_1d; 304 code << " local CeedScalar s_B_in_" << i << "[" << P_1d * Q_1d << "];\n"; 305 code << " loadMatrix(P_in_" << i << "*Q_1D, B->inputs[" << i << "], s_B_in_" << i << ");\n"; 306 if (use_collograd_parallelization) { 307 h_G.inputs[i] = basis_impl->d_collo_grad_1d; 308 code << " local CeedScalar s_G_in_" << i << "[" << Q_1d * Q_1d << "];\n"; 309 code << " loadMatrix(Q_1D*Q_1D, G->inputs[" << i << "], s_G_in_" << i << ");\n"; 310 } else { 311 bool has_collo_grad = basis_impl->d_collo_grad_1d; 312 h_G.inputs[i] = has_collo_grad ? basis_impl->d_collo_grad_1d : basis_impl->d_grad_1d; 313 code << " local CeedScalar s_G_in_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n"; 314 code << " loadMatrix(" << (has_collo_grad ? "Q_1D" : ("P_in_" + std::to_string(i))) << "*Q_1D, G->inputs[" << i << "], s_G_in_" << i 315 << ");\n"; 316 } 317 break; 318 case CEED_EVAL_WEIGHT: 319 break; // No action 320 case CEED_EVAL_DIV: 321 break; // TODO: Not implemented 322 case CEED_EVAL_CURL: 323 break; // TODO: Not implemented 324 } 325 } 326 327 code << "\n // -- Output field constants and basis data --\n"; 328 for (CeedInt i = 0; i < num_output_fields; i++) { 329 code << " // ---- Output field " << i << " ----\n"; 330 // Get elem_size, eval_mode, num_comp 331 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 332 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 333 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 334 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 335 336 // Set field constants 337 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 338 if (basis != CEED_BASIS_NONE) { 339 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 340 code << " const CeedInt P_out_" << i << " = " << P_1d << ";\n"; 341 } else { 342 code << " const CeedInt P_out_" << i << " = " << Q_1d << ";\n"; 343 } 344 code << " const CeedInt num_comp_out_" << i << " = " << num_comp << ";\n"; 345 346 // Load basis data 347 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 348 switch (eval_mode) { 349 case CEED_EVAL_NONE: 350 break; // No action 351 case CEED_EVAL_INTERP: 352 CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); 353 h_B.outputs[i] = basis_impl->d_interp_1d; 354 code << " local CeedScalar s_B_out_" << i << "[" << P_1d * Q_1d << "];\n"; 355 code << " loadMatrix(P_out_" << i << "*Q_1D, B->outputs[" << i << "], s_B_out_" << i << ");\n"; 356 break; 357 case CEED_EVAL_GRAD: 358 CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); 359 h_B.outputs[i] = basis_impl->d_interp_1d; 360 code << " local CeedScalar s_B_out_" << i << "[" << P_1d * Q_1d << "];\n"; 361 code << " loadMatrix(P_out_" << i << "*Q_1D, B->outputs[" << i << "], s_B_out_" << i << ");\n"; 362 if (use_collograd_parallelization) { 363 h_G.outputs[i] = basis_impl->d_collo_grad_1d; 364 code << " local CeedScalar s_G_out_" << i << "[" << Q_1d * Q_1d << "];\n"; 365 code << " loadMatrix(Q_1D*Q_1D, G->outputs[" << i << "], s_G_out_" << i << ");\n"; 366 } else { 367 bool has_collo_grad = basis_impl->d_collo_grad_1d; 368 h_G.outputs[i] = has_collo_grad ? basis_impl->d_collo_grad_1d : basis_impl->d_grad_1d; 369 code << " local CeedScalar s_G_out_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n"; 370 code << " loadMatrix(" << (has_collo_grad ? "Q_1D" : ("P_out_" + std::to_string(i))) << "*Q_1D, G->outputs[" << i << "], s_G_out_" << i 371 << ");\n"; 372 } 373 break; 374 // LCOV_EXCL_START 375 case CEED_EVAL_WEIGHT: { 376 Ceed ceed; 377 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 378 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 379 break; // Should not occur 380 } 381 case CEED_EVAL_DIV: 382 break; // TODO: Not implemented 383 case CEED_EVAL_CURL: 384 break; // TODO: Not implemented 385 // LCOV_EXCL_STOP 386 } 387 } 388 code << "\n // -- Element loop --\n"; 389 code << " work_group_barrier(CLK_LOCAL_MEM_FENCE);\n"; 390 code << " {\n"; 391 // Input basis apply if needed 392 // Generate the correct eval mode code for each input 393 code << " // -- Input field restrictions and basis actions --\n"; 394 for (CeedInt i = 0; i < num_input_fields; i++) { 395 code << " // ---- Input field " << i << " ----\n"; 396 // Get elem_size, eval_mode, num_comp 397 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 398 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 399 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 400 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 401 402 // Restriction 403 if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_collograd_parallelization)) { 404 bool is_strided; 405 406 code << " CeedScalar r_u_" << i << "[num_comp_in_" << i << "*P_in_" << i << "];\n"; 407 408 CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); 409 if (!is_strided) { 410 CeedInt comp_stride; 411 412 CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 413 code << " const CeedInt l_size_in_" << i << " = " << l_size << ";\n"; 414 CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 415 code << " // CompStride: " << comp_stride << "\n"; 416 CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_impl)); 417 h_indices.inputs[i] = rstr_impl->d_ind; 418 code << " readDofsOffset" << dim << "d(num_comp_in_" << i << ", " << comp_stride << ", P_in_" << i << ", num_elem, indices->inputs[" << i 419 << "], d_u_" << i << ", r_u_" << i << ");\n"; 420 } else { 421 bool has_backend_strides; 422 CeedInt num_elem; 423 424 CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); 425 CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); 426 CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 427 428 if (!has_backend_strides) { 429 CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); 430 } 431 code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; 432 code << " readDofsStrided" << dim << "d(num_comp_in_" << i << ",P_in_" << i << "," << strides[0] << "," << strides[1] << "," << strides[2] 433 << ", num_elem, d_u_" << i << ", r_u_" << i << ");\n"; 434 } 435 } 436 437 // Basis action 438 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 439 switch (eval_mode) { 440 case CEED_EVAL_NONE: 441 if (!use_collograd_parallelization) { 442 code << " private CeedScalar* r_t_" << i << " = r_u_" << i << ";\n"; 443 } 444 break; 445 case CEED_EVAL_INTERP: 446 code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1D];\n"; 447 code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_in_" << i << ", P_in_" << i << ", Q_1D, r_u_" << i << ", s_B_in_" << i 448 << ", r_t_" << i << ", elem_scratch);\n"; 449 break; 450 case CEED_EVAL_GRAD: 451 if (use_collograd_parallelization) { 452 code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1D];\n"; 453 code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_in_" << i << ", P_in_" << i << ", Q_1D, r_u_" << i << ", s_B_in_" 454 << i << ", r_t_" << i << ", elem_scratch);\n"; 455 } else { 456 CeedInt P_1d; 457 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 458 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 459 code << " CeedScalar r_t_" << i << "[num_comp_in_" << i << "*DIM*Q_1D];\n"; 460 code << " Grad" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d(num_comp_in_" << i 461 << ", P_in_" << i << ", Q_1D, r_u_" << i << (dim > 1 ? ", s_B_in_" : "") << (dim > 1 ? std::to_string(i) : "") << ", s_G_in_" << i 462 << ", r_t_" << i << ", elem_scratch);\n"; 463 } 464 break; 465 case CEED_EVAL_WEIGHT: 466 code << " CeedScalar r_t_" << i << "[Q_1D];\n"; 467 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 468 CeedCallBackend(CeedBasisGetData(basis, &basis_impl)); 469 impl->W = basis_impl->d_q_weight_1d; 470 code << " Weight" << (dim > 1 ? "Tensor" : "") << dim << "d(Q_1D, W, r_t_" << i << ");\n"; 471 break; // No action 472 case CEED_EVAL_DIV: 473 break; // TODO: Not implemented 474 case CEED_EVAL_CURL: 475 break; // TODO: Not implemented 476 } 477 } 478 479 // Q function 480 code << "\n // -- Output field setup --\n"; 481 for (CeedInt i = 0; i < num_output_fields; i++) { 482 code << "\n // ---- Output field " << i << " ----\n"; 483 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 484 if (eval_mode == CEED_EVAL_GRAD) { 485 if (use_collograd_parallelization) { 486 // Accumulator for gradient slices 487 code << " CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*Q_1D];\n"; 488 code << " for (CeedInt i = 0; i < num_comp_out_" << i << "; i++) {\n"; 489 code << " for (CeedInt j = 0; j < Q_1D; ++j) {\n"; 490 code << " r_tt_" << i << "[j + i*Q_1D] = 0.0;\n"; 491 code << " }\n"; 492 code << " }\n"; 493 } else { 494 code << " CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*DIM*Q_1D];\n"; 495 } 496 } 497 if (eval_mode == CEED_EVAL_NONE || eval_mode == CEED_EVAL_INTERP) { 498 code << " CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*Q_1D];\n"; 499 } 500 } 501 // We treat quadrature points per slice in 3d to save registers 502 if (use_collograd_parallelization) { 503 code << "\n // Note: Using planes of 3D elements\n"; 504 code << " for (CeedInt q = 0; q < Q_1D; q++) {\n"; 505 code << " // -- Input fields --\n"; 506 for (CeedInt i = 0; i < num_input_fields; i++) { 507 code << " // ---- Input field " << i << " ----\n"; 508 // Get elem_size, eval_mode, num_comp 509 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 510 // Basis action 511 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 512 switch (eval_mode) { 513 case CEED_EVAL_NONE: 514 bool is_strided; 515 516 code << " CeedScalar r_q_" << i << "[num_comp_in_" << i << "];\n"; 517 518 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 519 CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); 520 if (!is_strided) { 521 CeedInt comp_stride; 522 523 CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 524 code << " const CeedInt l_size_in_" << i << " = " << l_size << ";\n"; 525 CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 526 code << " // CompStride: " << comp_stride << "\n"; 527 CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_impl)); 528 h_indices.inputs[i] = rstr_impl->d_ind; 529 code << " readSliceQuadsOffset" 530 << "3d(num_comp_in_" << i << ", " << comp_stride << ", Q_1D, l_size_in_" << i << ", num_elem, q, indices->inputs[" << i << "], d_u_" 531 << i << ", r_q_" << i << ");\n"; 532 } else { 533 bool has_backend_strides; 534 CeedInt num_elem; 535 536 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 537 CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); 538 CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); 539 CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 540 541 if (!has_backend_strides) { 542 CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); 543 } 544 code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; 545 code << " readSliceQuadsStrided" 546 << "3d(num_comp_in_" << i << ", Q_1D," << strides[0] << ", " << strides[1] << ", " << strides[2] << ", num_elem, q, d_u_" << i 547 << ", r_q_" << i << ");\n"; 548 } 549 break; 550 case CEED_EVAL_INTERP: 551 code << " CeedScalar r_q_" << i << "[num_comp_in_" << i << "];\n"; 552 code << " for (CeedInt j = 0; j < num_comp_in_" << i << " ; ++j) {\n"; 553 code << " r_q_" << i << "[j] = r_t_" << i << "[q + j*Q_1D];\n"; 554 code << " }\n"; 555 break; 556 case CEED_EVAL_GRAD: 557 code << " CeedScalar r_q_" << i << "[num_comp_in_" << i << "*DIM];\n"; 558 code << " gradCollo3d(num_comp_in_" << i << ", Q_1D, q, r_t_" << i << ", s_G_in_" << i << ", r_q_" << i << ", elem_scratch);\n"; 559 break; 560 case CEED_EVAL_WEIGHT: 561 code << " CeedScalar r_q_" << i << "[1];\n"; 562 code << " r_q_" << i << "[0] = r_t_" << i << "[q];\n"; 563 break; // No action 564 case CEED_EVAL_DIV: 565 break; // TODO: Not implemented 566 case CEED_EVAL_CURL: 567 break; // TODO: Not implemented 568 } 569 } 570 code << "\n // -- Output fields --\n"; 571 for (CeedInt i = 0; i < num_output_fields; i++) { 572 code << " // ---- Output field " << i << " ----\n"; 573 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 574 // Basis action 575 switch (eval_mode) { 576 case CEED_EVAL_NONE: 577 code << " CeedScalar r_qq_" << i << "[num_comp_out_" << i << "];\n"; 578 break; // No action 579 case CEED_EVAL_INTERP: 580 code << " CeedScalar r_qq_" << i << "[num_comp_out_" << i << "];\n"; 581 break; 582 case CEED_EVAL_GRAD: 583 code << " CeedScalar r_qq_" << i << "[num_comp_out_" << i << "*DIM];\n"; 584 break; 585 case CEED_EVAL_WEIGHT: 586 break; // Should not occur 587 case CEED_EVAL_DIV: 588 break; // TODO: Not implemented 589 case CEED_EVAL_CURL: 590 break; // TODO: Not implemented 591 } 592 } 593 } else { 594 code << "\n // Note: Using full elements\n"; 595 code << " // -- Input fields --\n"; 596 for (CeedInt i = 0; i < num_input_fields; i++) { 597 code << " // ---- Input field " << i << " ----\n"; 598 code << " private CeedScalar* r_q_" << i << " = r_t_" << i << ";\n"; 599 } 600 code << " // -- Output fields --\n"; 601 for (CeedInt i = 0; i < num_output_fields; i++) { 602 code << " // ---- Output field " << i << " ----\n"; 603 code << " private CeedScalar* r_qq_" << i << " = r_tt_" << i << ";\n"; 604 } 605 } 606 //-------------------------------------------------- 607 code << "\n // -- QFunction Inputs and outputs --\n"; 608 code << " const CeedScalar * in[" << num_input_fields << "];\n"; 609 for (CeedInt i = 0; i < num_input_fields; i++) { 610 code << " // ---- Input field " << i << " ----\n"; 611 code << " in[" << i << "] = r_q_" << i << ";\n"; 612 } 613 code << " CeedScalar * out[" << num_output_fields << "];\n"; 614 for (CeedInt i = 0; i < num_output_fields; i++) { 615 code << " // ---- Output field " << i << " ----\n"; 616 code << " out[" << i << "] = r_qq_" << i << ";\n"; 617 } 618 619 code << "\n // -- Apply QFunction --\n"; 620 code << " " << q_function_name << "(ctx, "; 621 if (dim != 3 || use_collograd_parallelization) { 622 code << "1"; 623 } else { 624 code << "Q_1D"; 625 } 626 code << ", in, out);\n"; 627 //-------------------------------------------------- 628 629 if (use_collograd_parallelization) { 630 code << " // -- Output fields --\n"; 631 for (CeedInt i = 0; i < num_output_fields; i++) { 632 code << " // ---- Output field " << i << " ----\n"; 633 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 634 // Basis action 635 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 636 switch (eval_mode) { 637 case CEED_EVAL_NONE: 638 code << " for (CeedInt j = 0; j < num_comp_out_" << i << " ; ++j) {\n"; 639 code << " r_tt_" << i << "[q + j*Q_1D] = r_qq_" << i << "[j];\n"; 640 code << " }\n"; 641 break; // No action 642 case CEED_EVAL_INTERP: 643 code << " for (CeedInt j = 0; j < num_comp_out_" << i << " ; ++j) {\n"; 644 code << " r_tt_" << i << "[q + j*Q_1D] = r_qq_" << i << "[j];\n"; 645 code << " }\n"; 646 break; 647 case CEED_EVAL_GRAD: 648 code << " gradColloTranspose3d(num_comp_out_" << i << ",Q_1D, q, r_qq_" << i << ", s_G_out_" << i << ", r_tt_" << i 649 << ", elem_scratch);\n"; 650 break; 651 case CEED_EVAL_WEIGHT: 652 break; // Should not occur 653 case CEED_EVAL_DIV: 654 break; // TODO: Not implemented 655 case CEED_EVAL_CURL: 656 break; // TODO: Not implemented 657 } 658 } 659 code << " }\n"; 660 } 661 662 // Output basis apply if needed 663 // Generate the correct eval mode code for each output 664 code << "\n // -- Output field basis action and restrictions --\n"; 665 for (CeedInt i = 0; i < num_output_fields; i++) { 666 code << " // ---- Output field " << i << " ----\n"; 667 // Get elem_size, eval_mode, num_comp 668 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 669 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 670 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 671 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 672 // Basis action 673 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 674 switch (eval_mode) { 675 case CEED_EVAL_NONE: 676 code << " private CeedScalar* r_v_" << i << " = r_tt_" << i << ";\n"; 677 break; // No action 678 case CEED_EVAL_INTERP: 679 code << " CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n"; 680 code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_out_" << i << ",P_out_" << i << ", Q_1D, r_tt_" << i 681 << ", s_B_out_" << i << ", r_v_" << i << ", elem_scratch);\n"; 682 break; 683 case CEED_EVAL_GRAD: 684 code << " CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n"; 685 if (use_collograd_parallelization) { 686 code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_out_" << i << ",P_out_" << i << ", Q_1D, r_tt_" << i 687 << ", s_B_out_" << i << ", r_v_" << i << ", elem_scratch);\n"; 688 } else { 689 CeedInt P_1d; 690 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 691 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 692 code << " GradTranspose" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d(num_comp_out_" << i 693 << ", P_out_" << i << ", Q_1D, r_tt_" << i << (dim > 1 ? ", s_B_out_" : "") << (dim > 1 ? std::to_string(i) : "") << ", s_G_out_" << i 694 << ", r_v_" << i << ", elem_scratch);\n"; 695 } 696 break; 697 // LCOV_EXCL_START 698 case CEED_EVAL_WEIGHT: { 699 Ceed ceed; 700 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 701 return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 702 break; // Should not occur 703 } 704 case CEED_EVAL_DIV: 705 break; // TODO: Not implemented 706 case CEED_EVAL_CURL: 707 break; // TODO: Not implemented 708 // LCOV_EXCL_STOP 709 } 710 // Restriction 711 bool is_strided; 712 713 CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); 714 if (!is_strided) { 715 CeedInt comp_stride; 716 717 CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 718 code << " const CeedInt l_size_out_" << i << " = " << l_size << ";\n"; 719 CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 720 code << " // CompStride: " << comp_stride << "\n"; 721 CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_impl)); 722 h_indices.outputs[i] = rstr_impl->d_ind; 723 code << " writeDofsOffset" << dim << "d(num_comp_out_" << i << ", " << comp_stride << ", P_out_" << i << ", num_elem, indices->outputs[" << i 724 << "], r_v_" << i << ", d_v_" << i << ");\n"; 725 } else { 726 bool has_backend_strides; 727 CeedInt num_elem; 728 729 CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); 730 CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); 731 CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 732 733 if (!has_backend_strides) { 734 CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); 735 } 736 code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; 737 code << " writeDofsStrided" << dim << "d(num_comp_out_" << i << ",P_out_" << i << "," << strides[0] << "," << strides[1] << "," << strides[2] 738 << ", num_elem, r_v_" << i << ", d_v_" << i << ");\n"; 739 } 740 } 741 742 code << " }\n"; 743 code << "}\n"; 744 code << "// -----------------------------------------------------------------------------\n\n"; 745 746 // Copy the struct (containing device addresses) from the host to the device 747 sycl::event copy_B = sycl_data->sycl_queue.copy<Fields_Sycl>(&h_B, impl->B, 1); 748 sycl::event copy_G = sycl_data->sycl_queue.copy<Fields_Sycl>(&h_G, impl->G, 1); 749 sycl::event copy_indices = sycl_data->sycl_queue.copy<FieldsInt_Sycl>(&h_indices, impl->indices, 1); 750 // These copies can happen while the JIT is being done 751 CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_B, copy_G, copy_indices})); 752 753 // View kernel for debugging 754 CeedDebug256(ceed, 2, "Generated Operator Kernels:\n"); 755 CeedDebug(ceed, code.str().c_str()); 756 757 std::map<std::string, CeedInt> jit_constants; 758 jit_constants["T_1D"] = block_sizes[0]; 759 jit_constants["GROUP_SIZE_X"] = block_sizes[0]; 760 jit_constants["GROUP_SIZE_Y"] = block_sizes[1]; 761 jit_constants["GROUP_SIZE_Z"] = block_sizes[2]; 762 763 // Compile kernel into a kernel bundle 764 CeedCallBackend(CeedBuildModule_Sycl(ceed, code.str(), &impl->sycl_module, jit_constants)); 765 766 // Load kernel function 767 CeedCallBackend(CeedGetKernel_Sycl(ceed, impl->sycl_module, operator_name, &impl->op)); 768 769 CeedCallBackend(CeedOperatorSetSetupDone(op)); 770 return CEED_ERROR_SUCCESS; 771 } 772 773 //------------------------------------------------------------------------------ 774