1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #define CEED_DEBUG_COLOR 12 9 10 #include <ceed/ceed.h> 11 #include <ceed/backend.h> 12 #include <ceed/jit-tools.h> 13 #include <iostream> 14 #include <string> 15 #include <sstream> 16 #include "ceed-hip-gen.h" 17 #include "../hip-ref/ceed-hip-ref.h" 18 #include "../hip-shared/ceed-hip-shared.h" 19 #include "../hip/ceed-hip-compile.h" 20 21 22 //------------------------------------------------------------------------------ 23 // Calculate the block size used for launching the operator kernel 24 //------------------------------------------------------------------------------ 25 extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt num_elem, 26 const CeedInt P_1d, const CeedInt Q_1d, 27 CeedInt *block_sizes) { 28 29 const CeedInt thread1d = CeedIntMax(Q_1d, P_1d); 30 if (dim==1) { 31 CeedInt elems_per_block = 64*thread1d > 256? 256/thread1d : 64; 32 elems_per_block = elems_per_block>0?elems_per_block:1; 33 block_sizes[0] = thread1d; 34 block_sizes[1] = 1; 35 block_sizes[2] = elems_per_block; 36 } else if (dim==2) { 37 const CeedInt elems_per_block = thread1d<4? 16 : 2; 38 block_sizes[0] = thread1d; 39 block_sizes[1] = thread1d; 40 block_sizes[2] = elems_per_block; 41 } else if (dim==3) { 42 const CeedInt elems_per_block = thread1d<6? 4 : (thread1d<8? 2 : 1); 43 block_sizes[0] = thread1d; 44 block_sizes[1] = thread1d; 45 block_sizes[2] = elems_per_block; 46 } 47 return CEED_ERROR_SUCCESS; 48 } 49 50 //------------------------------------------------------------------------------ 51 // Build single operator kernel 52 //------------------------------------------------------------------------------ 53 extern "C" int CeedHipGenOperatorBuild(CeedOperator op) { 54 55 using std::ostringstream; 56 using std::string; 57 int ierr; 58 bool is_setup_done; 59 ierr = CeedOperatorIsSetupDone(op, &is_setup_done); CeedChkBackend(ierr); 60 if (is_setup_done) return CEED_ERROR_SUCCESS; 61 Ceed ceed; 62 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 63 CeedOperator_Hip_gen *data; 64 ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr); 65 CeedQFunction qf; 66 CeedQFunction_Hip_gen *qf_data; 67 ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); 68 ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr); 69 CeedSize lsize; 70 CeedInt Q, P_1d = 0, Q_1d = 0, elem_size, num_input_fields, 71 num_output_fields, num_comp, dim = 1; 72 ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); 73 Q_1d = Q; 74 CeedOperatorField *op_input_fields, *op_output_fields; 75 ierr = CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields); 76 CeedChkBackend(ierr); 77 CeedQFunctionField *qf_input_fields, *qf_output_fields; 78 ierr = CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields); 79 CeedChkBackend(ierr); 80 CeedEvalMode eval_mode; 81 CeedBasis basis; 82 CeedBasis_Hip_shared *basis_data; 83 CeedElemRestriction Erestrict; 84 CeedElemRestriction_Hip *restr_data; 85 86 // Check for restriction only identity operator 87 bool is_identity_qf; 88 ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr); 89 if (is_identity_qf) { 90 CeedEvalMode eval_mode_in, eval_mode_out; 91 ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in); CeedChkBackend(ierr); 92 ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out); CeedChkBackend(ierr); 93 if (eval_mode_in == CEED_EVAL_NONE && eval_mode_out == CEED_EVAL_NONE) 94 // LCOV_EXCL_START 95 return CeedError(ceed, CEED_ERROR_BACKEND, 96 "Backend does not implement restriction only identity operators"); 97 // LCOV_EXCL_STOP 98 } 99 100 ostringstream code; 101 // TODO: generalize to accept different device functions? 102 { 103 char *tensor_basis_kernel_path, *tensor_basis_kernel_source; 104 ierr = CeedGetJitAbsolutePath(ceed, 105 "ceed/jit-source/hip/hip-shared-basis-tensor-templates.h", 106 &tensor_basis_kernel_path); CeedChkBackend(ierr); 107 CeedDebug256(ceed, 2, "----- Loading Tensor Basis Kernel Source -----\n"); 108 ierr = CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_kernel_source); 109 CeedChkBackend(ierr); 110 code << tensor_basis_kernel_source; 111 ierr = CeedFree(&tensor_basis_kernel_path); CeedChkBackend(ierr); 112 ierr = CeedFree(&tensor_basis_kernel_source); CeedChkBackend(ierr); 113 } 114 { 115 char *hip_gen_template_path, *hip_gen_template_source; 116 ierr = CeedGetJitAbsolutePath(ceed, 117 "ceed/jit-source/hip/hip-gen-templates.h", 118 &hip_gen_template_path); CeedChkBackend(ierr); 119 CeedDebug256(ceed, 2, "----- Loading Hip-Gen Template Source -----\n"); 120 ierr = CeedLoadSourceToBuffer(ceed, hip_gen_template_path, &hip_gen_template_source); 121 CeedChkBackend(ierr); 122 code << hip_gen_template_source; 123 ierr = CeedFree(&hip_gen_template_path); CeedChkBackend(ierr); 124 ierr = CeedFree(&hip_gen_template_source); CeedChkBackend(ierr); 125 } 126 127 string q_function_source(qf_data->q_function_source); 128 string q_function_name(qf_data->q_function_name); 129 string operator_name; 130 operator_name = "CeedKernel_Hip_gen_" + q_function_name; 131 132 // Find dim, P_1d, Q_1d 133 data->max_P_1d = 0; 134 for (CeedInt i = 0; i < num_input_fields; i++) { 135 ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr); 136 if (basis != CEED_BASIS_COLLOCATED) { 137 ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 138 ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); 139 CeedChkBackend(ierr); 140 141 // Collect dim, P_1d, and Q_1d 142 ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 143 bool isTensor; 144 ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr); 145 if (isTensor) { 146 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 147 ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 148 if (P_1d > data->max_P_1d) data->max_P_1d = P_1d; 149 } else { 150 // LCOV_EXCL_START 151 return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 152 // LCOV_EXCL_STOP 153 } 154 } 155 } 156 // Check output bases for Q_1d, dim as well 157 // The only input basis might be CEED_BASIS_COLLOCATED 158 for (CeedInt i = 0; i < num_output_fields; i++) { 159 ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis); CeedChkBackend(ierr); 160 161 if (basis != CEED_BASIS_COLLOCATED) { 162 ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 163 ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 164 CeedChkBackend(ierr); 165 166 // Collect Q_1d 167 ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 168 bool isTensor; 169 ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr); 170 if (isTensor) { 171 ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 172 } else { 173 // LCOV_EXCL_START 174 return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 175 // LCOV_EXCL_STOP 176 } 177 } 178 } 179 data->dim = dim; 180 data->Q_1d = Q_1d; 181 182 // Only use 3D collocated gradient parallelization strategy when gradient is computed 183 // TODO: put in a function? 184 bool use_collograd_parallelization = false; 185 if (dim == 3) { 186 bool was_grad_found = false; 187 for (CeedInt i = 0; i < num_input_fields; i++) { 188 ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); 189 if (eval_mode == CEED_EVAL_GRAD) { 190 ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr); 191 ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 192 use_collograd_parallelization = !!basis_data->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true); 193 was_grad_found = true; 194 } 195 } 196 for (CeedInt i = 0; i < num_output_fields; i++) { 197 ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 198 if (eval_mode == CEED_EVAL_GRAD) { 199 ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis); CeedChkBackend(ierr); 200 ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 201 use_collograd_parallelization = !!basis_data->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true); 202 was_grad_found = true; 203 } 204 } 205 } 206 207 // Define CEED_Q_VLA 208 code << "\n#undef CEED_Q_VLA\n"; 209 if (dim != 3 || use_collograd_parallelization) { 210 code << "#define CEED_Q_VLA 1\n\n"; 211 } else { 212 code << "#define CEED_Q_VLA "<<Q_1d<<"\n\n"; 213 } 214 215 code << q_function_source; 216 217 // Setup 218 code << "\n// -----------------------------------------------------------------------------\n"; 219 code << "\nextern \"C\" __launch_bounds__(BLOCK_SIZE)\n"; 220 code << "__global__ void "<<operator_name<<"(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar* W) {\n"; 221 for (CeedInt i = 0; i < num_input_fields; i++) { 222 ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); 223 CeedChkBackend(ierr); 224 if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 225 code << " const CeedScalar* d_u_" <<i<<" = fields.inputs["<<i<<"];\n"; 226 } 227 } 228 229 for (CeedInt i = 0; i < num_output_fields; i++) { 230 code << " CeedScalar* d_v_"<<i<<" = fields.outputs["<<i<<"];\n"; 231 } 232 233 code << " const CeedInt dim = "<<dim<<";\n"; 234 code << " const CeedInt Q_1d = "<<Q_1d<<";\n"; 235 236 code << " HIP_DYNAMIC_SHARED( CeedScalar, slice)\n"; 237 code << " SharedData_Hip data;\n"; 238 code << " data.t_id_x = threadIdx.x;\n"; 239 code << " data.t_id_y = threadIdx.y;\n"; 240 code << " data.t_id_z = threadIdx.z;\n"; 241 code << " data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 242 code << " data.slice = slice+data.t_id_z*T_1D"<<(dim>1?"*T_1D":"")<<";\n"; 243 244 code << "\n // -- Input field constants and basis data --\n"; 245 //Initialize constants, and matrices B and G 246 for (CeedInt i = 0; i < num_input_fields; i++) { 247 code << " // ---- Input field "<<i<<" ----\n"; 248 // Get elem_size, eval_mode, num_comp 249 ierr = CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict); 250 CeedChkBackend(ierr); 251 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elem_size); 252 CeedChkBackend(ierr); 253 ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); 254 CeedChkBackend(ierr); 255 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &num_comp); 256 CeedChkBackend(ierr); 257 258 // Set field constants 259 if (eval_mode != CEED_EVAL_WEIGHT) { 260 ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr); 261 if (basis != CEED_BASIS_COLLOCATED) { 262 ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 263 code << " const CeedInt P_in_"<<i<<" = "<<P_1d<<";\n"; 264 } else { 265 code << " const CeedInt P_in_"<<i<<" = "<<Q_1d<<";\n"; 266 } 267 code << " const CeedInt num_comp_in_"<<i<<" = "<<num_comp<<";\n"; 268 } 269 270 // Load basis data 271 code << " // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n"; 272 switch (eval_mode) { 273 case CEED_EVAL_NONE: 274 break; 275 case CEED_EVAL_INTERP: 276 ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 277 data->B.inputs[i] = basis_data->d_interp_1d; 278 code << " __shared__ CeedScalar s_B_in_"<<i<<"["<<P_1d*Q_1d<<"];\n"; 279 code << " loadMatrix<P_in_"<<i<<",Q_1d>(data, B.inputs["<<i<<"], s_B_in_"<<i<<");\n"; 280 break; 281 case CEED_EVAL_GRAD: 282 ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 283 data->B.inputs[i] = basis_data->d_interp_1d; 284 code << " __shared__ CeedScalar s_B_in_"<<i<<"["<<P_1d*Q_1d<<"];\n"; 285 code << " loadMatrix<P_in_"<<i<<",Q_1d>(data, B.inputs["<<i<<"], s_B_in_"<<i<<");\n"; 286 if (use_collograd_parallelization) { 287 data->G.inputs[i] = basis_data->d_collo_grad_1d; 288 code << " __shared__ CeedScalar s_G_in_"<<i<<"["<<Q_1d*Q_1d<<"];\n"; 289 code << " loadMatrix<Q_1d,Q_1d>(data, G.inputs["<<i<<"], s_G_in_"<<i<<");\n"; 290 } else { 291 bool has_collo_grad = !!basis_data->d_collo_grad_1d; 292 data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; 293 code << " __shared__ CeedScalar s_G_in_"<<i<<"["<<Q_1d*(has_collo_grad?Q_1d:P_1d)<<"];\n"; 294 code << " loadMatrix<"<<(has_collo_grad?"Q_1d":("P_in_"+std::to_string(i)))<<",Q_1d>(data, G.inputs["<<i<<"], s_G_in_"<<i<<");\n"; 295 } 296 break; 297 case CEED_EVAL_WEIGHT: 298 break; // No action 299 case CEED_EVAL_DIV: 300 break; // TODO: Not implemented 301 case CEED_EVAL_CURL: 302 break; // TODO: Not implemented 303 } 304 } 305 306 code << "\n // -- Output field constants and basis data --\n"; 307 for (CeedInt i = 0; i < num_output_fields; i++) { 308 code << " // ---- Output field "<<i<<" ----\n"; 309 // Get elem_size, eval_mode, num_comp 310 ierr = CeedOperatorFieldGetElemRestriction(op_output_fields[i], &Erestrict); 311 CeedChkBackend(ierr); 312 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elem_size); 313 CeedChkBackend(ierr); 314 ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 315 CeedChkBackend(ierr); 316 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &num_comp); 317 CeedChkBackend(ierr); 318 319 // Set field constants 320 ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis); CeedChkBackend(ierr); 321 if (basis != CEED_BASIS_COLLOCATED) { 322 ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 323 code << " const CeedInt P_out_"<<i<<" = "<<P_1d<<";\n"; 324 } else { 325 code << " const CeedInt P_out_"<<i<<" = "<<Q_1d<<";\n"; 326 } 327 code << " const CeedInt num_comp_out_"<<i<<" = "<<num_comp<<";\n"; 328 329 // Load basis data 330 code << " // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n"; 331 switch (eval_mode) { 332 case CEED_EVAL_NONE: 333 break; // No action 334 case CEED_EVAL_INTERP: 335 ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 336 data->B.outputs[i] = basis_data->d_interp_1d; 337 code << " __shared__ CeedScalar s_B_out_"<<i<<"["<<P_1d*Q_1d<<"];\n"; 338 code << " loadMatrix<P_out_"<<i<<",Q_1d>(data, B.outputs["<<i<<"], s_B_out_"<<i<<");\n"; 339 break; 340 case CEED_EVAL_GRAD: 341 ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 342 data->B.outputs[i] = basis_data->d_interp_1d; 343 code << " __shared__ CeedScalar s_B_out_"<<i<<"["<<P_1d*Q_1d<<"];\n"; 344 code << " loadMatrix<P_out_"<<i<<",Q_1d>(data, B.outputs["<<i<<"], s_B_out_"<<i<<");\n"; 345 if (use_collograd_parallelization) { 346 data->G.outputs[i] = basis_data->d_collo_grad_1d; 347 code << " __shared__ CeedScalar s_G_out_"<<i<<"["<<Q_1d*Q_1d<<"];\n"; 348 code << " loadMatrix<Q_1d,Q_1d>(data, G.outputs["<<i<<"], s_G_out_"<<i<<");\n"; 349 } else { 350 bool has_collo_grad = !!basis_data->d_collo_grad_1d; 351 data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; 352 code << " __shared__ CeedScalar s_G_out_"<<i<<"["<<Q_1d*(has_collo_grad?Q_1d:P_1d)<<"];\n"; 353 code << " loadMatrix<"<<(has_collo_grad?"Q_1d":("P_out_"+std::to_string(i)))<<",Q_1d>(data, G.outputs["<<i<<"], s_G_out_"<<i<<");\n"; 354 } 355 break; 356 // LCOV_EXCL_START 357 case CEED_EVAL_WEIGHT: { 358 Ceed ceed; 359 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 360 return CeedError(ceed, CEED_ERROR_BACKEND, 361 "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 362 break; // Should not occur 363 } 364 case CEED_EVAL_DIV: 365 break; // TODO: Not implemented 366 case CEED_EVAL_CURL: 367 break; // TODO: Not implemented 368 // LCOV_EXCL_STOP 369 } 370 } 371 code << "\n // -- Element loop --\n"; 372 code << " __syncthreads();\n"; 373 code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n"; 374 // Input basis apply if needed 375 // Generate the correct eval mode code for each input 376 code << " // -- Input field restrictions and basis actions --\n"; 377 for (CeedInt i = 0; i < num_input_fields; i++) { 378 code << " // ---- Input field "<<i<<" ----\n"; 379 // Get elem_size, eval_mode, num_comp 380 ierr = CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict); 381 CeedChkBackend(ierr); 382 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elem_size); 383 CeedChkBackend(ierr); 384 ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); 385 CeedChkBackend(ierr); 386 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &num_comp); 387 CeedChkBackend(ierr); 388 389 // Restriction 390 if (eval_mode != CEED_EVAL_WEIGHT && 391 !((eval_mode == CEED_EVAL_NONE) && use_collograd_parallelization)) { 392 code << " CeedScalar r_u_"<<i<<"[num_comp_in_"<<i<<"*P_in_"<<i<<"];\n"; 393 394 bool is_strided; 395 ierr = CeedElemRestrictionIsStrided(Erestrict, &is_strided); CeedChkBackend(ierr); 396 if (!is_strided) { 397 ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 398 CeedChkBackend(ierr); 399 code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 400 CeedInt comp_stride; 401 ierr = CeedElemRestrictionGetCompStride(Erestrict, &comp_stride); CeedChkBackend(ierr); 402 code << " // CompStride: "<<comp_stride<<"\n"; 403 ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 404 data->indices.inputs[i] = restr_data->d_ind; 405 code << " readDofsOffset"<<dim<<"d<num_comp_in_"<<i<<", "<<comp_stride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.inputs["<<i<<"], d_u_"<<i<<", r_u_"<<i<<");\n"; 406 } else { 407 bool has_backend_strides; 408 ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides); 409 CeedChkBackend(ierr); 410 CeedInt num_elem; 411 ierr = CeedElemRestrictionGetNumElements(Erestrict, &num_elem); 412 CeedChkBackend(ierr); 413 CeedInt strides[3] = {1, elem_size*num_elem, elem_size}; 414 if (!has_backend_strides) { 415 ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 416 CeedChkBackend(ierr); 417 } 418 code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 419 code << " readDofsStrided"<<dim<<"d<num_comp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u_"<<i<<", r_u_"<<i<<");\n"; 420 } 421 } 422 423 // Basis action 424 code << " // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n"; 425 switch (eval_mode) { 426 case CEED_EVAL_NONE: 427 if (!use_collograd_parallelization) { 428 code << " CeedScalar* r_t_"<<i<<" = r_u_"<<i<<";\n"; 429 } 430 break; 431 case CEED_EVAL_INTERP: 432 code << " CeedScalar r_t_"<<i<<"[num_comp_in_"<<i<<"*Q_1d];\n"; 433 code << " Interp"<<(dim>1?"Tensor":"")<<dim<<"d<num_comp_in_"<<i<<",P_in_"<<i<<",Q_1d>(data, r_u_"<<i<<", s_B_in_"<<i<<", r_t_"<<i<<");\n"; 434 break; 435 case CEED_EVAL_GRAD: 436 if (use_collograd_parallelization) { 437 code << " CeedScalar r_t_"<<i<<"[num_comp_in_"<<i<<"*Q_1d];\n"; 438 code << " Interp"<<(dim>1?"Tensor":"")<<dim<<"d<num_comp_in_"<<i<<",P_in_"<<i<<",Q_1d>(data, r_u_"<<i<<", s_B_in_"<<i<<", r_t_"<<i<<");\n"; 439 } else { 440 CeedInt P_1d; 441 ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr); 442 ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 443 code << " CeedScalar r_t_"<<i<<"[num_comp_in_"<<i<<"*dim*Q_1d];\n"; 444 code << " Grad"<<(dim>1?"Tensor":"")<<(dim==3&&Q_1d>=P_1d?"Collocated":"")<<dim<<"d<num_comp_in_"<<i<<",P_in_"<<i<<",Q_1d>(data, r_u_"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t_"<<i<<");\n"; 445 } 446 break; 447 case CEED_EVAL_WEIGHT: 448 code << " CeedScalar r_t_"<<i<<"[Q_1d];\n"; 449 ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr); 450 ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 451 data->W = basis_data->d_q_weight_1d; 452 code << " Weight"<<(dim>1?"Tensor":"")<<dim<<"d<Q_1d>(data, W, r_t_"<<i<<");\n"; 453 break; // No action 454 case CEED_EVAL_DIV: 455 break; // TODO: Not implemented 456 case CEED_EVAL_CURL: 457 break; // TODO: Not implemented 458 } 459 } 460 461 // Q function 462 code << "\n // -- Output field setup --\n"; 463 for (CeedInt i = 0; i < num_output_fields; i++) { 464 code << "\n // ---- Output field "<<i<<" ----\n"; 465 ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 466 CeedChkBackend(ierr); 467 if (eval_mode==CEED_EVAL_GRAD) 468 { 469 if (use_collograd_parallelization) { 470 //Accumulator for gradient slices 471 code << " CeedScalar r_tt_"<<i<<"[num_comp_out_"<<i<<"*Q_1d];\n"; 472 code << " for (CeedInt i = 0; i < num_comp_out_"<<i<<"; i++) {\n"; 473 code << " for (CeedInt j = 0; j < Q_1d; ++j) {\n"; 474 code << " r_tt_"<<i<<"[j + i*Q_1d] = 0.0;\n"; 475 code << " }\n"; 476 code << " }\n"; 477 } else { 478 code << " CeedScalar r_tt_"<<i<<"[num_comp_out_"<<i<<"*dim*Q_1d];\n"; 479 } 480 } 481 if (eval_mode==CEED_EVAL_NONE || eval_mode==CEED_EVAL_INTERP) 482 { 483 code << " CeedScalar r_tt_"<<i<<"[num_comp_out_"<<i<<"*Q_1d];\n"; 484 } 485 } 486 // We treat quadrature points per slice in 3d to save registers 487 if (use_collograd_parallelization) { 488 code << "\n // Note: Using planes of 3D elements\n"; 489 code << "#pragma unroll\n"; 490 code << " for (CeedInt q = 0; q < Q_1d; q++) {\n"; 491 code << " // -- Input fields --\n"; 492 for (CeedInt i = 0; i < num_input_fields; i++) { 493 code << " // ---- Input field "<<i<<" ----\n"; 494 // Get elem_size, eval_mode, num_comp 495 ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); 496 CeedChkBackend(ierr); 497 // Basis action 498 code << " // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n"; 499 switch (eval_mode) { 500 case CEED_EVAL_NONE: 501 code << " CeedScalar r_q_"<<i<<"[num_comp_in_"<<i<<"];\n"; 502 503 bool is_strided; 504 ierr = CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict); CeedChkBackend(ierr); 505 ierr = CeedElemRestrictionIsStrided(Erestrict, &is_strided); CeedChkBackend(ierr); 506 if (!is_strided) { 507 ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 508 CeedChkBackend(ierr); 509 code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 510 CeedInt comp_stride; 511 ierr = CeedElemRestrictionGetCompStride(Erestrict, &comp_stride); CeedChkBackend(ierr); 512 code << " // CompStride: "<<comp_stride<<"\n"; 513 ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 514 data->indices.inputs[i] = restr_data->d_ind; 515 code << " readSliceQuadsOffset"<<"3d<num_comp_in_"<<i<<", "<<comp_stride<<", Q_1d>(data, lsize_in_"<<i<<", elem, q, indices.inputs["<<i<<"], d_u_"<<i<<", r_q_"<<i<<");\n"; 516 } else { 517 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elem_size); CeedChkBackend(ierr); 518 bool has_backend_strides; 519 ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides); 520 CeedChkBackend(ierr); 521 CeedInt num_elem; 522 ierr = CeedElemRestrictionGetNumElements(Erestrict, &num_elem); 523 CeedChkBackend(ierr); 524 CeedInt strides[3] = {1, elem_size*num_elem, elem_size}; 525 if (!has_backend_strides) { 526 ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 527 CeedChkBackend(ierr); 528 } 529 code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 530 code << " readSliceQuadsStrided"<<"3d<num_comp_in_"<<i<<",Q_1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u_"<<i<<", r_q_"<<i<<");\n"; 531 } 532 break; 533 case CEED_EVAL_INTERP: 534 code << " CeedScalar r_q_"<<i<<"[num_comp_in_"<<i<<"];\n"; 535 code << " for (CeedInt j = 0; j < num_comp_in_"<<i<<" ; ++j) {\n"; 536 code << " r_q_"<<i<<"[j] = r_t_"<<i<<"[q + j*Q_1d];\n"; 537 code << " }\n"; 538 break; 539 case CEED_EVAL_GRAD: 540 code << " CeedScalar r_q_"<<i<<"[num_comp_in_"<<i<<"*dim];\n"; 541 code << " gradCollo3d<num_comp_in_"<<i<<",Q_1d>(data, q, r_t_"<<i<<", s_G_in_"<<i<<", r_q_"<<i<<");\n"; 542 break; 543 case CEED_EVAL_WEIGHT: 544 code << " CeedScalar r_q_"<<i<<"[1];\n"; 545 code << " r_q_"<<i<<"[0] = r_t_"<<i<<"[q];\n"; 546 break; // No action 547 case CEED_EVAL_DIV: 548 break; // TODO: Not implemented 549 case CEED_EVAL_CURL: 550 break; // TODO: Not implemented 551 } 552 } 553 code << "\n // -- Output fields --\n"; 554 for (CeedInt i = 0; i < num_output_fields; i++) { 555 code << " // ---- Output field "<<i<<" ----\n"; 556 ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 557 CeedChkBackend(ierr); 558 // Basis action 559 switch (eval_mode) { 560 case CEED_EVAL_NONE: 561 code << " CeedScalar r_qq_"<<i<<"[num_comp_out_"<<i<<"];\n"; 562 break; // No action 563 case CEED_EVAL_INTERP: 564 code << " CeedScalar r_qq_"<<i<<"[num_comp_out_"<<i<<"];\n"; 565 break; 566 case CEED_EVAL_GRAD: 567 code << " CeedScalar r_qq_"<<i<<"[num_comp_out_"<<i<<"*dim];\n"; 568 break; 569 case CEED_EVAL_WEIGHT: 570 break; // Should not occur 571 case CEED_EVAL_DIV: 572 break; // TODO: Not implemented 573 case CEED_EVAL_CURL: 574 break; // TODO: Not implemented 575 } 576 } 577 } else { 578 code << "\n // Note: Using full elements\n"; 579 code << " // -- Input fields --\n"; 580 for (CeedInt i = 0; i < num_input_fields; i++) { 581 code << " // ---- Input field "<<i<<" ----\n"; 582 code << " CeedScalar* r_q_"<<i<<" = r_t_"<<i<<";\n"; 583 } 584 code << " // -- Output fields --\n"; 585 for (CeedInt i = 0; i < num_output_fields; i++) { 586 code << " // ---- Output field "<<i<<" ----\n"; 587 code << " CeedScalar* r_qq_"<<i<<" = r_tt_"<<i<<";\n"; 588 } 589 } 590 code << "\n // -- QFunction Inputs and outputs --\n"; 591 code << " CeedScalar* in["<<num_input_fields<<"];\n"; 592 for (CeedInt i = 0; i < num_input_fields; i++) { 593 code << " // ---- Input field "<<i<<" ----\n"; 594 code << " in["<<i<<"] = r_q_"<<i<<";\n"; 595 } 596 code << " CeedScalar* out["<<num_output_fields<<"];\n"; 597 for (CeedInt i = 0; i < num_output_fields; i++) { 598 code << " // ---- Output field "<<i<<" ----\n"; 599 code << " out["<<i<<"] = r_qq_"<<i<<";\n"; 600 } 601 code << "\n // -- Apply QFunction --\n"; 602 code << " "<<q_function_name<<"(ctx, "; 603 if (dim != 3 || use_collograd_parallelization) { 604 code << "1"; 605 } else { 606 code << "Q_1d"; 607 } 608 code << ", in, out);\n"; 609 if (use_collograd_parallelization) { 610 code << " // -- Output fields --\n"; 611 for (CeedInt i = 0; i < num_output_fields; i++) { 612 code << " // ---- Output field "<<i<<" ----\n"; 613 ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 614 CeedChkBackend(ierr); 615 // Basis action 616 code << " // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n"; 617 switch (eval_mode) { 618 case CEED_EVAL_NONE: 619 code << " for (CeedInt j = 0; j < num_comp_out_"<<i<<" ; ++j) {\n"; 620 code << " r_tt_"<<i<<"[q + j*Q_1d] = r_qq_"<<i<<"[j];\n"; 621 code << " }\n"; 622 break; // No action 623 case CEED_EVAL_INTERP: 624 code << " for (CeedInt j = 0; j < num_comp_out_"<<i<<" ; ++j) {\n"; 625 code << " r_tt_"<<i<<"[q + j*Q_1d] = r_qq_"<<i<<"[j];\n"; 626 code << " }\n"; 627 break; 628 case CEED_EVAL_GRAD: 629 code << " gradColloTranspose3d<num_comp_out_"<<i<<",Q_1d>(data, q, r_qq_"<<i<<", s_G_out_"<<i<<", r_tt_"<<i<<");\n"; 630 break; 631 case CEED_EVAL_WEIGHT: 632 break; // Should not occur 633 case CEED_EVAL_DIV: 634 break; // TODO: Not implemented 635 case CEED_EVAL_CURL: 636 break; // TODO: Not implemented 637 } 638 } 639 code << " }\n"; 640 } 641 642 // Output basis apply if needed 643 // Generate the correct eval mode code for each output 644 code << "\n // -- Output field basis action and restrictions --\n"; 645 for (CeedInt i = 0; i < num_output_fields; i++) { 646 code << " // ---- Output field "<<i<<" ----\n"; 647 // Get elem_size, eval_mode, num_comp 648 ierr = CeedOperatorFieldGetElemRestriction(op_output_fields[i], &Erestrict); 649 CeedChkBackend(ierr); 650 ierr = CeedElemRestrictionGetElementSize(Erestrict, &elem_size); 651 CeedChkBackend(ierr); 652 ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 653 CeedChkBackend(ierr); 654 ierr = CeedElemRestrictionGetNumComponents(Erestrict, &num_comp); 655 CeedChkBackend(ierr); 656 // Basis action 657 code << " // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n"; 658 switch (eval_mode) { 659 case CEED_EVAL_NONE: 660 code << " CeedScalar* r_v_"<<i<<" = r_tt_"<<i<<";\n"; 661 break; // No action 662 case CEED_EVAL_INTERP: 663 code << " CeedScalar r_v_"<<i<<"[num_comp_out_"<<i<<"*P_out_"<<i<<"];\n"; 664 code << " InterpTranspose"<<(dim>1?"Tensor":"")<<dim<<"d<num_comp_out_"<<i<<",P_out_"<<i<<",Q_1d>(data, r_tt_"<<i<<", s_B_out_"<<i<<", r_v_"<<i<<");\n"; 665 break; 666 case CEED_EVAL_GRAD: 667 code << " CeedScalar r_v_"<<i<<"[num_comp_out_"<<i<<"*P_out_"<<i<<"];\n"; 668 if (use_collograd_parallelization) { 669 code << " InterpTranspose"<<(dim>1?"Tensor":"")<<dim<<"d<num_comp_out_"<<i<<",P_out_"<<i<<",Q_1d>(data, r_tt_"<<i<<", s_B_out_"<<i<<", r_v_"<<i<<");\n"; 670 } else { 671 CeedInt P_1d; 672 ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis); CeedChkBackend(ierr); 673 ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 674 code << " GradTranspose"<<(dim>1?"Tensor":"")<<(dim==3&&Q_1d>=P_1d?"Collocated":"")<<dim<<"d<num_comp_out_"<<i<<",P_out_"<<i<<",Q_1d>(data, r_tt_"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v_"<<i<<");\n"; 675 } 676 break; 677 // LCOV_EXCL_START 678 case CEED_EVAL_WEIGHT: { 679 Ceed ceed; 680 ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 681 return CeedError(ceed, CEED_ERROR_BACKEND, 682 "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 683 break; // Should not occur 684 } 685 case CEED_EVAL_DIV: 686 break; // TODO: Not implemented 687 case CEED_EVAL_CURL: 688 break; // TODO: Not implemented 689 // LCOV_EXCL_STOP 690 } 691 // Restriction 692 bool is_strided; 693 ierr = CeedElemRestrictionIsStrided(Erestrict, &is_strided); CeedChkBackend(ierr); 694 if (!is_strided) { 695 ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 696 CeedChkBackend(ierr); 697 code << " const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n"; 698 CeedInt comp_stride; 699 ierr = CeedElemRestrictionGetCompStride(Erestrict, &comp_stride); CeedChkBackend(ierr); 700 code << " // CompStride: "<<comp_stride<<"\n"; 701 ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 702 data->indices.outputs[i] = restr_data->d_ind; 703 code << " writeDofsOffset"<<dim<<"d<num_comp_out_"<<i<<", "<<comp_stride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.outputs["<<i<<"], r_v_"<<i<<", d_v_"<<i<<");\n"; 704 } else { 705 bool has_backend_strides; 706 ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides); 707 CeedChkBackend(ierr); 708 CeedInt num_elem; 709 ierr = CeedElemRestrictionGetNumElements(Erestrict, &num_elem); 710 CeedChkBackend(ierr); 711 CeedInt strides[3] = {1, elem_size*num_elem, elem_size}; 712 if (!has_backend_strides) { 713 ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 714 CeedChkBackend(ierr); 715 } 716 code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 717 code << " writeDofsStrided"<<dim<<"d<num_comp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v_"<<i<<", d_v_"<<i<<");\n"; 718 } 719 } 720 721 code << " }\n"; 722 code << "}\n"; 723 code << "// -----------------------------------------------------------------------------\n\n"; 724 725 // View kernel for debugging 726 CeedDebug256(ceed, 2, "Generated Operator Kernels:\n"); 727 CeedDebug(ceed, code.str().c_str()); 728 729 CeedInt block_sizes[3] = {0, 0, 0}; 730 CeedInt num_elem; 731 ierr = CeedOperatorGetNumElements(op, &num_elem); CeedChkBackend(ierr); 732 ierr = BlockGridCalculate_Hip_gen(dim, num_elem, data->max_P_1d, Q_1d, block_sizes); 733 CeedChkBackend(ierr); 734 ierr = CeedCompileHip(ceed, code.str().c_str(), &data->module, 2, 735 "T_1D", block_sizes[0], 736 "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]); 737 CeedChkBackend(ierr); 738 ierr = CeedGetKernelHip(ceed, data->module, operator_name.c_str(), &data->op); 739 CeedChkBackend(ierr); 740 741 ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr); 742 return CEED_ERROR_SUCCESS; 743 } 744 //------------------------------------------------------------------------------ 745