// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

#define CEED_DEBUG_COLOR 12

#include <ceed.h>
#include <ceed/backend.h>
#include <ceed/jit-tools.h>

#include <iostream>
#include <sstream>
#include <string>

#include "../hip-ref/ceed-hip-ref.h"
#include "../hip-shared/ceed-hip-shared.h"
#include "../hip/ceed-hip-common.h"
#include "../hip/ceed-hip-compile.h"
#include "ceed-hip-gen.h"

//------------------------------------------------------------------------------
// Calculate the block size used for launching the operator kernel
//
// block_sizes[0] and block_sizes[1] are the x/y thread-block dimensions;
// block_sizes[2] is the number of elements processed per block.
// thread_1d = max(P_1d, Q_1d) so one thread covers each 1D node or quadrature
// point of an element.
// NOTE(review): num_elem is currently unused in this computation.
//------------------------------------------------------------------------------
extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt num_elem, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes) {
  const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
  if (dim == 1) {
    // Pack many 1D elements per block while keeping x*z threads <= 256
    CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;

    // Guard: 256 / thread_1d truncates to 0 when thread_1d > 256
    elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
    block_sizes[0] = thread_1d;
    block_sizes[1] = 1;
    block_sizes[2] = elems_per_block;
  } else if (dim == 2) {
    // 2D: thread_1d x thread_1d threads per element
    const CeedInt elems_per_block = thread_1d < 4 ? 16 : 2;

    block_sizes[0] = thread_1d;
    block_sizes[1] = thread_1d;
    block_sizes[2] = elems_per_block;
  } else if (dim == 3) {
    // 3D: fewer elements per block as the per-element thread count grows
    const CeedInt elems_per_block = thread_1d < 6 ? 4 : (thread_1d < 8 ? 2 : 1);

    block_sizes[0] = thread_1d;
    block_sizes[1] = thread_1d;
    block_sizes[2] = elems_per_block;
  }
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Determine type of operator
//
// Scans the bases of all input and output fields to determine: the common
// dimension and quadrature size (CeedCheck enforces agreement across fields),
// the largest P_1d, whether every active basis is tensor-product, and whether
// the 3D collocated-gradient ("3D slices") strategy can be used.
//------------------------------------------------------------------------------
static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
                                               CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields,
                                               CeedQFunctionField *qf_output_fields, CeedInt *max_P_1d, CeedInt *Q_1d, CeedInt *dim, bool *is_tensor,
                                               bool *use_3d_slices) {
  // Find dim, P_1d, Q_1d
  *max_P_1d  = 0;
  *Q_1d      = 0;
  *dim       = 0;
  *is_tensor = true;

  // -- Input fields
  for (CeedInt i = 0; i < num_input_fields; i++) {
    CeedBasis basis;

    CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
    if (basis != CEED_BASIS_NONE) {
      bool    is_field_tensor;
      CeedInt field_P_1d = 0, field_Q_1d = 0, field_dim = 0;

      // Collect dim, P_1d, and Q_1d
      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
      *is_tensor = *is_tensor && is_field_tensor;
      // For non-tensor bases the total node count stands in for P_1d
      if (is_field_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
      else CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P_1d));
      *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
      // All active bases must agree on dimension
      CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
      CeedCheck(*dim == 0 || field_dim == *dim, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
      *dim = field_dim;
      // All active bases must agree on quadrature size
      if (is_field_tensor) CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
      else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q_1d));
      CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
      *Q_1d = field_Q_1d;
    }
    CeedCallBackend(CeedBasisDestroy(&basis));
  }
  // -- Output fields (same collection and agreement checks as inputs)
  for
(CeedInt i = 0; i < num_output_fields; i++) {
    CeedBasis basis;

    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
    if (basis != CEED_BASIS_NONE) {
      bool    is_field_tensor;
      CeedInt field_P_1d = 0, field_Q_1d = 0, field_dim = 0;

      // Collect dim, P_1d, and Q_1d
      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
      *is_tensor = *is_tensor && is_field_tensor;
      if (is_field_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
      else CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P_1d));
      *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
      CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
      CeedCheck(*dim == 0 || field_dim == *dim, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
      *dim = field_dim;
      if (is_field_tensor) CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
      else CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q_1d));
      CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
      *Q_1d = field_Q_1d;
    }
    CeedCallBackend(CeedBasisDestroy(&basis));
  }

  // Only use 3D collocated gradient parallelization strategy when gradient is computed
  // use_3d_slices ends up true only when dim == 3 and *every* CEED_EVAL_GRAD
  // field (input or output) has a collocated gradient matrix on device
  *use_3d_slices = false;
  if (*dim == 3) {
    bool was_grad_found = false;

    for (CeedInt i = 0; i < num_input_fields; i++) {
      CeedEvalMode eval_mode;

      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
      if (eval_mode == CEED_EVAL_GRAD) {
        CeedBasis_Hip_shared *basis_data;
        CeedBasis             basis;

        CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
        CeedCallBackend(CeedBasisGetData(basis, &basis_data));
        // AND across all gradient fields; the first hit seeds the value
        *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
        was_grad_found = true;
        CeedCallBackend(CeedBasisDestroy(&basis));
      }
    }
    for (CeedInt i = 0; i < num_output_fields; i++) {
      CeedEvalMode eval_mode;

      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      if (eval_mode == CEED_EVAL_GRAD) {
        CeedBasis_Hip_shared *basis_data;
        CeedBasis             basis;

        CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
        CeedCallBackend(CeedBasisGetData(basis, &basis_data));
        *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
        was_grad_found = true;
        CeedCallBackend(CeedBasisDestroy(&basis));
      }
    }
  }
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Setup fields
//
// Emits per-field constants (P, num_comp) into the generated kernel source and
// loads the basis matrices into shared memory (s_B, and s_G for gradients),
// recording the matching device pointers in data->B / data->G.
//------------------------------------------------------------------------------
static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedOperatorField op_field,
                                                    CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool is_tensor, bool is_at_points,
                                                    bool use_3d_slices) {
  std::string           var_suffix  = (is_input ? "_in_" : "_out_") + std::to_string(i);
  std::string           P_name      = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
  std::string           option_name = (is_input ? "inputs" : "outputs");
  CeedEvalMode          eval_mode   = CEED_EVAL_NONE;
  CeedInt               elem_size = 0, num_comp = 0, P_1d = 0;
  CeedElemRestriction   elem_rstr;
  CeedBasis_Hip_shared *basis_data;
  CeedBasis             basis;

  code << "  // -- " << (is_input ?
"Input" : "Output") << " field " << i << "\n"; 169 170 // Get field data 171 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr)); 172 if (elem_rstr != CEED_ELEMRESTRICTION_NONE) { 173 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 174 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 175 } 176 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 177 CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis)); 178 if (basis != CEED_BASIS_NONE) { 179 CeedCallBackend(CeedBasisGetData(basis, &basis_data)); 180 if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 181 else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d)); 182 } 183 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode)); 184 185 // Set field constants 186 if (eval_mode != CEED_EVAL_WEIGHT) { 187 code << " const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n"; 188 code << " const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n"; 189 } 190 191 // Load basis data 192 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 193 switch (eval_mode) { 194 case CEED_EVAL_NONE: 195 break; 196 case CEED_EVAL_INTERP: 197 if (is_at_points) { 198 // AtPoints 199 if (!basis_data->d_chebyshev_interp_1d) { 200 CeedSize interp_bytes; 201 CeedScalar *chebyshev_interp_1d; 202 203 interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 204 CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 205 CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 206 CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes)); 207 CeedCallHip(CeedBasisReturnCeed(basis), 208 hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice)); 209 CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 210 } 211 if (is_input) data->B.inputs[i] = 
basis_data->d_chebyshev_interp_1d; 212 else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d; 213 } else { 214 // Standard quadrature 215 if (is_input) data->B.inputs[i] = basis_data->d_interp_1d; 216 else data->B.outputs[i] = basis_data->d_interp_1d; 217 } 218 code << " __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n"; 219 code << " LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n"; 220 break; 221 case CEED_EVAL_GRAD: 222 if (is_at_points) { 223 // AtPoints 224 if (!basis_data->d_chebyshev_interp_1d) { 225 CeedSize interp_bytes; 226 CeedScalar *chebyshev_interp_1d; 227 228 interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 229 CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 230 CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 231 CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes)); 232 CeedCallHip(CeedBasisReturnCeed(basis), 233 hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice)); 234 CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 235 } 236 if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d; 237 else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d; 238 } else { 239 // Standard quadrature 240 if (is_input) data->B.inputs[i] = basis_data->d_interp_1d; 241 else data->B.outputs[i] = basis_data->d_interp_1d; 242 } 243 if (is_tensor) { 244 code << " __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n"; 245 code << " LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." 
<< option_name << "[" << i << "], s_B" << var_suffix << ");\n"; 246 } 247 if (is_at_points) break; // No G mat for AtPoints 248 if (use_3d_slices) { 249 if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d; 250 else data->G.outputs[i] = basis_data->d_collo_grad_1d; 251 code << " __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n"; 252 code << " LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; 253 } else { 254 bool has_collo_grad = basis_data->d_collo_grad_1d; 255 256 if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; 257 else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; 258 if (has_collo_grad) { 259 code << " __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n"; 260 code << " LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; 261 } else { 262 code << " __shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim") << "];\n"; 263 code << " LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << ">(data, G." 
<< option_name << "[" << i << "], s_G" 264 << var_suffix << ");\n"; 265 } 266 } 267 break; 268 case CEED_EVAL_WEIGHT: 269 break; // No action 270 // LCOV_EXCL_START 271 case CEED_EVAL_DIV: 272 case CEED_EVAL_CURL: 273 break; // TODO: Not implemented 274 // LCOV_EXCL_STOP 275 } 276 CeedCallBackend(CeedBasisDestroy(&basis)); 277 return CEED_ERROR_SUCCESS; 278 } 279 280 //------------------------------------------------------------------------------ 281 // Restriction 282 //------------------------------------------------------------------------------ 283 static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedInt dim, 284 CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field, 285 CeedInt Q_1d, bool is_input, bool is_tensor, bool is_at_points, bool use_3d_slices) { 286 std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i); 287 std::string P_name = (is_tensor ? "P_1d" : "P") + var_suffix; 288 CeedEvalMode eval_mode = CEED_EVAL_NONE; 289 CeedInt elem_size = 0, num_comp = 0, P_1d = 0; 290 CeedSize l_size; 291 CeedRestrictionType rstr_type = CEED_RESTRICTION_STANDARD; 292 CeedElemRestriction_Hip *rstr_data; 293 CeedElemRestriction elem_rstr; 294 CeedBasis basis; 295 296 // Get field data 297 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr)); 298 if (elem_rstr != CEED_ELEMRESTRICTION_NONE) { 299 CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type)); 300 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 301 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 302 CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); 303 } 304 CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis)); 305 if (basis != CEED_BASIS_NONE) { 306 if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 307 else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d)); 
  }
  CeedCallBackend(CeedBasisDestroy(&basis));
  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));

  // Restriction
  if (is_input) {
    // Input
    if (field_input_buffer[i] != i) {
      std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);

      // Restriction was already done for previous input; alias its e-vector buffer
      code << "  CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
    } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) {
      if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) {
        // No basis action, so r_e_in_* is also r_q_in_* and needs to be allocated
        code << "  CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
      } else if (rstr_type != CEED_RESTRICTION_POINTS) {
        // Otherwise we're using the scratch space
        code << "  CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
      }
      switch (rstr_type) {
        case CEED_RESTRICTION_STANDARD: {
          CeedInt comp_stride;

          CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
          code << "  const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
          code << "  // CompStride: " << comp_stride << "\n";
          data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
          // Non-tensor reads use the 1d variant regardless of dim
          code << "  ReadLVecStandard" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name
               << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix << ");\n";
          break;
        }
        case CEED_RESTRICTION_STRIDED: {
          bool    has_backend_strides;
          CeedInt num_elem;

          CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
          CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
          // Default backend strides; overwritten below when user strides are set
          CeedInt strides[3] = {1, elem_size * num_elem, elem_size};

          if (!has_backend_strides) {
            CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
          }
          code << "  // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
          code << "  ReadLVecStrided" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", " << strides[0] << ", "
               << strides[1] << ", " << strides[2] << ">(data, elem, d" << var_suffix << ", r_e" << var_suffix << ");\n";
          break;
        }
        case CEED_RESTRICTION_POINTS: {
          // Only record the component stride and offsets here
          CeedInt comp_stride;

          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
          code << "  const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
          data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
          break;
        }
        // LCOV_EXCL_START
        case CEED_RESTRICTION_ORIENTED:
        case CEED_RESTRICTION_CURL_ORIENTED:
          break;  // TODO: Not implemented
        // LCOV_EXCL_STOP
      }
    }
  } else {
    // Output
    switch (rstr_type) {
      case CEED_RESTRICTION_STANDARD: {
        CeedInt comp_stride;

        CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
        code << "  const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
        CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
        code << "  // CompStride: " << comp_stride << "\n";
        data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
        code << "  WriteLVecStandard" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name
             << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix << ");\n";
        break;
      }
      case CEED_RESTRICTION_STRIDED: {
        bool    has_backend_strides;
        CeedInt num_elem;

        CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
        CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
        CeedInt strides[3] = {1, elem_size * num_elem, elem_size};

        if (!has_backend_strides) {
          CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
        }
        code << "  // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
        code << "  WriteLVecStrided" << (is_tensor ? dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", " << strides[0] << ", "
             << strides[1] << ", " << strides[2] << ">(data, elem, r_e" << var_suffix << ", d" << var_suffix << ");\n";
        break;
      }
      case CEED_RESTRICTION_POINTS:
        data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
        break;
      // LCOV_EXCL_START
      case CEED_RESTRICTION_ORIENTED:
      case CEED_RESTRICTION_CURL_ORIENTED:
        break;  // TODO: Not implemented
      // LCOV_EXCL_STOP
    }
  }
  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Basis
//
// Emits the basis action for one field: forward apply for inputs, transpose
// apply for outputs (interp/grad/weight variants, tensor or non-tensor).
//------------------------------------------------------------------------------
static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedInt dim,
                                                CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool is_tensor,
                                                bool is_at_points, bool use_3d_slices) {
  std::string var_suffix = (is_input ?
"_in_" : "_out_") + std::to_string(i); 424 std::string P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q"; 425 CeedEvalMode eval_mode = CEED_EVAL_NONE; 426 CeedInt elem_size = 0, num_comp = 0, P_1d = 0; 427 CeedElemRestriction elem_rstr; 428 CeedBasis basis; 429 430 // Get field data 431 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr)); 432 if (elem_rstr != CEED_ELEMRESTRICTION_NONE) { 433 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 434 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 435 } 436 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 437 CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis)); 438 if (basis != CEED_BASIS_NONE) { 439 if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 440 else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d)); 441 } 442 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode)); 443 444 // Basis 445 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 446 if (is_input) { 447 switch (eval_mode) { 448 case CEED_EVAL_NONE: 449 if (!use_3d_slices && !is_at_points) { 450 code << " CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n"; 451 } 452 break; 453 case CEED_EVAL_INTERP: 454 if (is_at_points) { 455 std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d"; 456 457 code << " CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n"; 458 code << " " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B" 459 << var_suffix << ", r_c" << var_suffix << ");\n"; 460 } else { 461 std::string function_name = is_tensor ? ((dim == 1 ? 
"Interp" : "InterpTensor") + std::to_string(dim) + "d") : "InterpNonTensor"; 462 463 code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n"; 464 code << " " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B" 465 << var_suffix << ", r_q" << var_suffix << ");\n"; 466 } 467 break; 468 case CEED_EVAL_GRAD: 469 if (is_at_points) { 470 std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d"; 471 472 code << " CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n"; 473 code << " " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B" 474 << var_suffix << ", r_c" << var_suffix << ");\n"; 475 } else if (use_3d_slices) { 476 std::string function_name = (dim > 1 ? "InterpTensor" : "Interp") + std::to_string(dim) + "d"; 477 478 code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n"; 479 code << " " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B" 480 << var_suffix << ", r_q" << var_suffix << ");\n"; 481 } else if (is_tensor) { 482 bool is_collocated = dim == 3 && Q_1d >= P_1d; 483 std::string function_name = (dim == 1 ? "Grad" : (is_collocated ? "GradTensorCollocated" : "GradTensor")) + std::to_string(dim) + "d"; 484 485 code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << (dim >= 3 ? 
Q_name : "1") << "];\n"; 486 code << " " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B" 487 << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n"; 488 } else { 489 std::string function_name = "GradNonTensor"; 490 491 code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim];\n"; 492 code << " " << function_name << "<num_comp" << var_suffix << ", dim, " << P_name << ", " << Q_name << ">(data, r_e" << var_suffix 493 << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n"; 494 } 495 break; 496 case CEED_EVAL_WEIGHT: { 497 if (is_at_points) { 498 code << " // Nothing to do AtPoints\n"; 499 } else { 500 CeedBasis_Hip_shared *basis_data; 501 std::string function_name = is_tensor ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d") : "WeightNonTensor"; 502 503 code << " CeedScalar r_q" << var_suffix << "[" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n"; 504 CeedCallBackend(CeedBasisGetData(basis, &basis_data)); 505 data->W = basis_data->d_q_weight_1d; 506 code << " " << function_name << "<" << Q_name << ">(data, W, r_q" << var_suffix << ");\n"; 507 } 508 break; 509 } 510 // LCOV_EXCL_START 511 case CEED_EVAL_DIV: 512 case CEED_EVAL_CURL: 513 break; // TODO: Not implemented 514 // LCOV_EXCL_STOP 515 } 516 } else { 517 switch (eval_mode) { 518 case CEED_EVAL_NONE: 519 code << " CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n"; 520 break; // No action 521 case CEED_EVAL_INTERP: 522 code << " CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n"; 523 if (is_at_points) { 524 std::string function_name = (dim == 1 ? 
"InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d"; 525 526 code << " " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_c" << var_suffix << ", s_B" 527 << var_suffix << ", r_e" << var_suffix << ");\n"; 528 } else { 529 std::string function_name = 530 is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d") : "InterpTransposeNonTensor"; 531 532 code << " " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix << ", s_B" 533 << var_suffix << ", r_e" << var_suffix << ");\n"; 534 } 535 break; 536 case CEED_EVAL_GRAD: 537 code << " CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n"; 538 if (is_at_points) { 539 std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d"; 540 541 code << " " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_c" << var_suffix << ", s_B" 542 << var_suffix << ", r_e" << var_suffix << ");\n"; 543 } else if (use_3d_slices) { 544 std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d"; 545 546 code << " " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix << ", s_B" 547 << var_suffix << ", r_e" << var_suffix << ");\n"; 548 } else if (is_tensor) { 549 bool is_collocated = dim == 3 && Q_1d >= P_1d; 550 std::string function_name = 551 (dim == 1 ? "GradTranspose" : (is_collocated ? 
"GradTransposeTensorCollocated" : "GradTransposeTensor")) + std::to_string(dim) + "d"; 552 553 code << " " << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix << ", s_B" 554 << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n"; 555 } else { 556 std::string function_name = "GradTransposeNonTensor"; 557 558 code << " " << function_name << "<num_comp" << var_suffix << ", dim, " << P_name << ", " << Q_name << ">(data, r_q" << var_suffix 559 << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n"; 560 } 561 break; 562 // LCOV_EXCL_START 563 case CEED_EVAL_WEIGHT: 564 break; // Should not occur 565 case CEED_EVAL_DIV: 566 case CEED_EVAL_CURL: 567 break; // TODO: Not implemented 568 // LCOV_EXCL_STOP 569 } 570 } 571 CeedCallBackend(CeedBasisDestroy(&basis)); 572 return CEED_ERROR_SUCCESS; 573 } 574 575 //------------------------------------------------------------------------------ 576 // QFunction 577 //------------------------------------------------------------------------------ 578 static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt dim, CeedInt max_num_points, 579 CeedInt num_input_fields, CeedOperatorField *op_input_fields, CeedQFunctionField *qf_input_fields, 580 CeedInt num_output_fields, CeedOperatorField *op_output_fields, 581 CeedQFunctionField *qf_output_fields, std::string qfunction_name, CeedInt Q_1d, bool is_tensor, 582 bool is_at_points, bool use_3d_slices) { 583 std::string Q_name = is_tensor ? 
"Q_1d" : "Q"; 584 CeedEvalMode eval_mode = CEED_EVAL_NONE; 585 CeedElemRestriction elem_rstr; 586 587 // Setup output arrays 588 code << "\n // -- Output field setup\n"; 589 for (CeedInt i = 0; i < num_output_fields; i++) { 590 std::string var_suffix = "_out_" + std::to_string(i); 591 592 code << " // ---- Output field " << i << "\n"; 593 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 594 switch (eval_mode) { 595 case CEED_EVAL_NONE: 596 if (is_at_points) { 597 code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n"; 598 } else { 599 code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n"; 600 } 601 break; 602 case CEED_EVAL_INTERP: 603 if (is_at_points) { 604 // Accumulator for point data 605 code << " CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n"; 606 code << " for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "; i++) {\n"; 607 code << " r_c" << var_suffix << "[i] = 0.0;\n"; 608 code << " }\n"; 609 } else { 610 code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n"; 611 } 612 break; 613 case CEED_EVAL_GRAD: 614 if (is_at_points) { 615 // Accumulator for point data 616 code << " CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "*dim];\n"; 617 code << " for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (dim >= 3 ? 
Q_name : "1") << "; i++) {\n"; 618 code << " r_c" << var_suffix << "[i] = 0.0;\n"; 619 code << " }\n"; 620 } else if (use_3d_slices) { 621 // Accumulator for gradient slices 622 code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n"; 623 code << " for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) {\n"; 624 code << " r_q" << var_suffix << "[i] = 0.0;\n"; 625 code << " }\n"; 626 } else { 627 code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << (is_tensor && (dim >= 3) ? Q_name : "1") << "];\n"; 628 } 629 break; 630 case CEED_EVAL_WEIGHT: 631 break; 632 // LCOV_EXCL_START 633 case CEED_EVAL_DIV: 634 case CEED_EVAL_CURL: 635 break; // TODO: Not implemented 636 // LCOV_EXCL_STOP 637 } 638 } 639 640 if (is_at_points) { 641 // We need to handle batches of points 642 code << "\n // Note: Using batches of points\n"; 643 code << " const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * max_num_points / (blockDim.x * blockDim.y));\n\n"; 644 code << " #pragma unroll\n"; 645 code << " for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {\n"; 646 code << " const CeedInt p = i % max_num_points;\n\n"; 647 648 code << " // -- Coordinates\n"; 649 code << " CeedScalar r_x[dim];\n"; 650 code << " ReadPoint<dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n"; 651 652 code << " // -- Input fields\n"; 653 for (CeedInt i = 0; i < num_input_fields; i++) { 654 std::string var_suffix = "_in_" + std::to_string(i); 655 656 code << " // ---- Input field " << i << "\n"; 657 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 658 // Basis action 659 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 660 switch (eval_mode) { 661 case CEED_EVAL_NONE: 662 code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 
663 code << " ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix 664 << ", max_num_points>(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n"; 665 break; 666 case CEED_EVAL_INTERP: 667 code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 668 code << " InterpAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_c" << var_suffix 669 << ", r_x, r_s" << var_suffix << ");\n"; 670 break; 671 case CEED_EVAL_GRAD: 672 code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n"; 673 code << " GradAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_c" << var_suffix 674 << ", r_x, r_s" << var_suffix << ");\n"; 675 break; 676 case CEED_EVAL_WEIGHT: 677 code << " CeedScalar r_s" << var_suffix << "[1];\n"; 678 code << " r_s" << var_suffix << "[0] = 1.0;\n"; 679 break; 680 // LCOV_EXCL_START 681 case CEED_EVAL_DIV: 682 case CEED_EVAL_CURL: 683 break; // TODO: Not implemented 684 // LCOV_EXCL_STOP 685 } 686 } 687 code << "\n // -- Output fields\n"; 688 for (CeedInt i = 0; i < num_output_fields; i++) { 689 std::string var_suffix = "_out_" + std::to_string(i); 690 691 code << " // ---- Output field " << i << "\n"; 692 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 693 // Basis action 694 switch (eval_mode) { 695 case CEED_EVAL_NONE: 696 code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 697 break; 698 case CEED_EVAL_INTERP: 699 code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 700 break; 701 case CEED_EVAL_GRAD: 702 code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n"; 703 break; 704 // LCOV_EXCL_START 705 case CEED_EVAL_WEIGHT: 706 break; // Should not occur 707 case CEED_EVAL_DIV: 708 case CEED_EVAL_CURL: 709 break; // TODO: Not implemented 710 // 
LCOV_EXCL_STOP 711 } 712 } 713 714 } else if (use_3d_slices) { 715 // We treat quadrature points per slice in 3d to save registers 716 code << "\n // Note: Using planes of 3D elements\n"; 717 code << " #pragma unroll\n"; 718 code << " for (CeedInt q = 0; q < " << Q_name << "; q++) {\n"; 719 code << " // -- Input fields\n"; 720 for (CeedInt i = 0; i < num_input_fields; i++) { 721 std::string var_suffix = "_in_" + std::to_string(i); 722 723 code << " // ---- Input field " << i << "\n"; 724 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 725 // Basis action 726 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 727 switch (eval_mode) { 728 case CEED_EVAL_NONE: 729 bool is_strided; 730 731 code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 732 733 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 734 CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); 735 if (is_strided) { 736 bool has_backend_strides; 737 CeedInt num_elem, elem_size; 738 739 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 740 CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); 741 CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); 742 CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 743 744 if (!has_backend_strides) { 745 CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); 746 } 747 code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; 748 code << " ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", " << strides[0] << ", " << strides[1] << ", " 749 << strides[2] << ">(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n"; 750 } else { 751 CeedSize l_size = 0; 752 CeedInt comp_stride; 753 CeedElemRestriction_Hip *rstr_data; 754 755 CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, 
&l_size)); 756 code << " const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; 757 CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 758 code << " // CompStride: " << comp_stride << "\n"; 759 CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); 760 data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; 761 code << " ReadEVecSliceStandard3d<num_comp" << var_suffix << ", " << comp_stride << ", " << Q_name << ">(data, l_size" << var_suffix 762 << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n"; 763 } 764 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 765 break; 766 case CEED_EVAL_INTERP: 767 code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 768 code << " for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n"; 769 code << " r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n"; 770 code << " }\n"; 771 break; 772 case CEED_EVAL_GRAD: 773 code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n"; 774 code << " GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_q" << var_suffix << ", s_G" << var_suffix 775 << ", r_s" << var_suffix << ");\n"; 776 break; 777 case CEED_EVAL_WEIGHT: 778 code << " CeedScalar r_s" << var_suffix << "[1];\n"; 779 code << " r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n"; 780 break; 781 // LCOV_EXCL_START 782 case CEED_EVAL_DIV: 783 case CEED_EVAL_CURL: 784 break; // TODO: Not implemented 785 // LCOV_EXCL_STOP 786 } 787 } 788 code << "\n // -- Output fields\n"; 789 for (CeedInt i = 0; i < num_output_fields; i++) { 790 std::string var_suffix = "_out_" + std::to_string(i); 791 792 code << " // ---- Output field " << i << "\n"; 793 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 794 // Basis action 795 switch (eval_mode) { 796 case CEED_EVAL_NONE: 797 code << " CeedScalar r_s" << 
var_suffix << "[num_comp" << var_suffix << "];\n"; 798 break; 799 case CEED_EVAL_INTERP: 800 code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 801 break; 802 case CEED_EVAL_GRAD: 803 code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n"; 804 break; 805 // LCOV_EXCL_START 806 case CEED_EVAL_WEIGHT: 807 break; // Should not occur 808 case CEED_EVAL_DIV: 809 case CEED_EVAL_CURL: 810 break; // TODO: Not implemented 811 // LCOV_EXCL_STOP 812 } 813 } 814 } else { 815 code << "\n // Note: Using full elements\n"; 816 code << " {\n"; 817 code << " // -- Input fields\n"; 818 for (CeedInt i = 0; i < num_input_fields; i++) { 819 code << " // ---- Input field " << i << "\n"; 820 code << " CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n"; 821 } 822 code << " // -- Output fields\n"; 823 for (CeedInt i = 0; i < num_output_fields; i++) { 824 code << " // ---- Output field " << i << "\n"; 825 code << " CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n"; 826 } 827 } 828 829 // Input and output buffers 830 code << "\n // -- QFunction inputs and outputs\n"; 831 code << " // ---- Inputs\n"; 832 code << " CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n"; 833 for (CeedInt i = 0; i < num_input_fields; i++) { 834 code << " // ------ Input field " << i << "\n"; 835 code << " inputs[" << i << "] = r_s_in_" << i << ";\n"; 836 } 837 code << " // ---- Outputs\n"; 838 code << " CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n"; 839 for (CeedInt i = 0; i < num_output_fields; i++) { 840 code << " // ------ Output field " << i << "\n"; 841 code << " outputs[" << i << "] = r_s_out_" << i << ";\n"; 842 } 843 844 // Apply QFunction 845 code << "\n // -- Apply QFunction\n"; 846 code << " " << qfunction_name << "(ctx, "; 847 if (dim != 3 || is_at_points || use_3d_slices || !is_tensor) { 848 code << "1"; 849 } else { 850 code << Q_name; 851 } 852 code << ", inputs, outputs);\n"; 853 854 if 
(is_at_points) { 855 // Map back to coefficients 856 code << "\n // -- Output fields\n"; 857 for (CeedInt i = 0; i < num_output_fields; i++) { 858 std::string var_suffix = "_out_" + std::to_string(i); 859 std::string P_name = "P_1d" + var_suffix; 860 861 code << " // ---- Output field " << i << "\n"; 862 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 863 // Basis action 864 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 865 switch (eval_mode) { 866 case CEED_EVAL_NONE: { 867 CeedInt comp_stride; 868 CeedElemRestriction elem_rstr; 869 870 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 871 CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 872 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 873 code << " const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 874 code << " WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix 875 << ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]" 876 << ", r_s" << var_suffix << ", d" << var_suffix << ");\n"; 877 break; 878 } 879 case CEED_EVAL_INTERP: 880 code << " if (i >= points.num_per_elem[elem]) {\n"; 881 code << " for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n"; 882 code << " }\n"; 883 code << " InterpTransposeAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_s" 884 << var_suffix << ", r_x, r_c" << var_suffix << ");\n"; 885 break; 886 case CEED_EVAL_GRAD: 887 code << " if (i >= points.num_per_elem[elem]) {\n"; 888 code << " for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim; j++) r_s" << var_suffix << "[j] = 0.0;\n"; 889 code << " }\n"; 890 code << " GradTransposeAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_s" 891 << var_suffix << ", r_x, r_c" << var_suffix << ");\n"; 892 break; 
893 // LCOV_EXCL_START 894 case CEED_EVAL_WEIGHT: 895 break; // Should not occur 896 case CEED_EVAL_DIV: 897 case CEED_EVAL_CURL: 898 break; // TODO: Not implemented 899 // LCOV_EXCL_STOP 900 } 901 } 902 } else if (use_3d_slices) { 903 // Copy or apply transpose grad, if needed 904 code << "\n // -- Output fields\n"; 905 for (CeedInt i = 0; i < num_output_fields; i++) { 906 std::string var_suffix = "_out_" + std::to_string(i); 907 std::string P_name = "P_1d" + var_suffix; 908 909 code << " // ---- Output field " << i << "\n"; 910 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 911 // Basis action 912 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 913 switch (eval_mode) { 914 case CEED_EVAL_NONE: 915 code << " for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n"; 916 code << " r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n"; 917 code << " }\n"; 918 break; 919 case CEED_EVAL_INTERP: 920 code << " for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n"; 921 code << " r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n"; 922 code << " }\n"; 923 break; 924 case CEED_EVAL_GRAD: 925 code << " GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_s" << var_suffix << ", s_G" 926 << var_suffix << ", r_q" << var_suffix << ");\n"; 927 break; 928 // LCOV_EXCL_START 929 case CEED_EVAL_WEIGHT: 930 break; // Should not occur 931 case CEED_EVAL_DIV: 932 case CEED_EVAL_CURL: 933 break; // TODO: Not implemented 934 // LCOV_EXCL_STOP 935 } 936 } 937 } 938 code << " }\n"; 939 return CEED_ERROR_SUCCESS; 940 } 941 942 //------------------------------------------------------------------------------ 943 // Build single operator kernel 944 //------------------------------------------------------------------------------ 945 extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_build) { 946 bool is_tensor = 
true, is_at_points = false, use_3d_slices = false; 947 Ceed ceed; 948 CeedInt Q_1d, num_input_fields, num_output_fields, dim = 1, max_num_points = 0, coords_comp_stride = 0; 949 CeedQFunctionField *qf_input_fields, *qf_output_fields; 950 CeedQFunction_Hip_gen *qf_data; 951 CeedQFunction qf; 952 CeedOperatorField *op_input_fields, *op_output_fields; 953 CeedOperator_Hip_gen *data; 954 std::ostringstream code; 955 956 CeedCallBackend(CeedOperatorGetData(op, &data)); 957 { 958 bool is_setup_done; 959 960 CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); 961 if (is_setup_done) { 962 *is_good_build = !data->use_fallback; 963 return CEED_ERROR_SUCCESS; 964 } 965 } 966 967 // Check field compatibility 968 CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 969 { 970 bool has_shared_bases = true, is_all_tensor = true, is_all_nontensor = true; 971 972 for (CeedInt i = 0; i < num_input_fields; i++) { 973 CeedBasis basis; 974 975 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 976 if (basis != CEED_BASIS_NONE) { 977 bool is_tensor = true; 978 const char *resource; 979 char *resource_root; 980 Ceed basis_ceed; 981 982 CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 983 is_all_tensor &= is_tensor; 984 is_all_nontensor &= !is_tensor; 985 CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); 986 CeedCallBackend(CeedGetResource(basis_ceed, &resource)); 987 CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); 988 has_shared_bases &= !strcmp(resource_root, "/gpu/hip/shared"); 989 CeedCallBackend(CeedFree(&resource_root)); 990 CeedCallBackend(CeedDestroy(&basis_ceed)); 991 } 992 CeedCallBackend(CeedBasisDestroy(&basis)); 993 } 994 995 for (CeedInt i = 0; i < num_output_fields; i++) { 996 CeedBasis basis; 997 998 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 999 if (basis != CEED_BASIS_NONE) { 1000 bool is_tensor = true; 
1001 const char *resource; 1002 char *resource_root; 1003 Ceed basis_ceed; 1004 1005 CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 1006 is_all_tensor &= is_tensor; 1007 is_all_nontensor &= !is_tensor; 1008 1009 CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); 1010 CeedCallBackend(CeedGetResource(basis_ceed, &resource)); 1011 CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); 1012 has_shared_bases &= !strcmp(resource_root, "/gpu/hip/shared"); 1013 CeedCallBackend(CeedFree(&resource_root)); 1014 CeedCallBackend(CeedDestroy(&basis_ceed)); 1015 } 1016 CeedCallBackend(CeedBasisDestroy(&basis)); 1017 } 1018 // -- Fallback to ref if not all bases are shared 1019 if (!has_shared_bases || (!is_all_tensor && !is_all_nontensor)) { 1020 *is_good_build = false; 1021 return CEED_ERROR_SUCCESS; 1022 } 1023 } 1024 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 1025 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 1026 CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); 1027 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 1028 1029 // Get operator data 1030 CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points)); 1031 CeedCallBackend(CeedOperatorBuildKernelData_Hip_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields, 1032 qf_output_fields, &data->max_P_1d, &Q_1d, &dim, &is_tensor, &use_3d_slices)); 1033 if (dim == 0) dim = 1; 1034 data->dim = dim; 1035 if (is_at_points) { 1036 CeedElemRestriction_Hip *rstr_data; 1037 CeedElemRestriction rstr_points = NULL; 1038 1039 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); 1040 CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points)); 1041 CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride)); 1042 CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data)); 1043 data->points.indices = (CeedInt 
*)rstr_data->d_offsets; 1044 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); 1045 } 1046 if (is_at_points) use_3d_slices = false; 1047 if (Q_1d == 0) { 1048 if (is_at_points) Q_1d = max_num_points; 1049 else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d)); 1050 } 1051 data->Q_1d = Q_1d; 1052 1053 // Check for restriction only identity operator 1054 { 1055 bool is_identity_qf; 1056 1057 CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf)); 1058 if (is_identity_qf) { 1059 CeedEvalMode eval_mode_in, eval_mode_out; 1060 1061 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in)); 1062 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out)); 1063 CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND, 1064 "Backend does not implement restriction only identity operators"); 1065 } 1066 } 1067 1068 // Load basis source files 1069 if (is_tensor) { 1070 code << "// Tensor basis source\n"; 1071 code << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n"; 1072 } else { 1073 code << "// Non-tensor basis source\n"; 1074 code << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n"; 1075 } 1076 if (is_at_points) { 1077 code << "// AtPoints basis source\n"; 1078 code << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n"; 1079 } 1080 code << "// CodeGen operator source\n"; 1081 code << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n"; 1082 1083 // Get QFunction name 1084 std::string qfunction_name(qf_data->qfunction_name); 1085 std::string operator_name; 1086 1087 operator_name = "CeedKernelHipGenOperator_" + qfunction_name; 1088 1089 // Define CEED_Q_VLA 1090 code << "\n#undef CEED_Q_VLA\n"; 1091 if (dim != 3 || is_at_points || use_3d_slices || !is_tensor) { 1092 code << "#define CEED_Q_VLA 1\n\n"; 1093 } else { 1094 code << "#define CEED_Q_VLA " << Q_1d << "\n\n"; 
1095 } 1096 1097 // Add user QFunction source 1098 { 1099 const char *source_path; 1100 1101 CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path)); 1102 CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file"); 1103 1104 code << "// User QFunction source\n"; 1105 code << "#include \"" << source_path << "\"\n\n"; 1106 } 1107 1108 // Setup 1109 code << "\n// -----------------------------------------------------------------------------\n"; 1110 code << "// Operator Kernel\n"; 1111 code << "// \n"; 1112 code << "// d_[in,out]_i: CeedVector device array\n"; 1113 code << "// r_[in,out]_e_i: Element vector register\n"; 1114 code << "// r_[in,out]_q_i: Quadrature space vector register\n"; 1115 code << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n"; 1116 code << "// r_[in,out]_s_i: Quadrature space slice vector register\n"; 1117 code << "// \n"; 1118 code << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n"; 1119 code << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; 1120 code << "// -----------------------------------------------------------------------------\n"; 1121 code << "\nextern \"C\" __launch_bounds__(BLOCK_SIZE)\n"; 1122 code << "__global__ void " << operator_name 1123 << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar* W, Points_Hip points) {\n"; 1124 1125 // Scratch buffers 1126 for (CeedInt i = 0; i < num_input_fields; i++) { 1127 CeedEvalMode eval_mode; 1128 1129 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 1130 if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 1131 code << " const CeedScalar *d_in_" << i << " = fields.inputs[" << i << "];\n"; 1132 } 1133 } 1134 for (CeedInt i = 0; i < num_output_fields; i++) { 1135 code << " CeedScalar *d_out_" << i << " = fields.outputs[" << i << "];\n"; 1136 } 1137 1138 code << " const CeedInt dim = " << dim << ";\n"; 
1139 code << " const CeedInt " << (is_tensor ? "Q_1d" : "Q") << " = " << Q_1d << ";\n"; 1140 if (is_at_points) { 1141 code << " const CeedInt max_num_points = " << max_num_points << ";\n"; 1142 code << " const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n"; 1143 } 1144 1145 // Shared data 1146 code << " extern __shared__ CeedScalar slice[];\n"; 1147 code << " SharedData_Hip data;\n"; 1148 code << " data.t_id_x = threadIdx.x;\n"; 1149 code << " data.t_id_y = threadIdx.y;\n"; 1150 code << " data.t_id_z = threadIdx.z;\n"; 1151 code << " data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 1152 code << " data.slice = slice + data.t_id_z*T_1D" << ((!is_tensor || dim == 1) ? "" : "*T_1D") << ";\n"; 1153 1154 // Initialize constants, and matrices B and G 1155 code << "\n // Input field constants and basis data\n"; 1156 for (CeedInt i = 0; i < num_input_fields; i++) { 1157 CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, is_tensor, 1158 is_at_points, use_3d_slices)); 1159 } 1160 code << "\n // Output field constants and basis data\n"; 1161 for (CeedInt i = 0; i < num_output_fields; i++) { 1162 CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_tensor, 1163 is_at_points, use_3d_slices)); 1164 } 1165 1166 // Loop over all elements 1167 code << "\n // Element loop\n"; 1168 code << " __syncthreads();\n"; 1169 code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n"; 1170 1171 // -- Compute minimum buffer space needed 1172 CeedInt max_rstr_buffer_size = 1; 1173 1174 for (CeedInt i = 0; i < num_input_fields; i++) { 1175 CeedInt num_comp, elem_size; 1176 CeedElemRestriction elem_rstr; 1177 1178 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 1179 
CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 1180 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 1181 max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_tensor && (dim >= 3) ? elem_size : 1)); 1182 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 1183 } 1184 for (CeedInt i = 0; i < num_output_fields; i++) { 1185 CeedInt num_comp, elem_size; 1186 CeedElemRestriction elem_rstr; 1187 1188 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 1189 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 1190 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 1191 max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_tensor && (dim >= 3) ? elem_size : 1)); 1192 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 1193 } 1194 code << " // Scratch restriction buffer space\n"; 1195 code << " CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n"; 1196 1197 // -- Determine best input field processing order 1198 CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX]; 1199 1200 for (CeedInt i = 0; i < num_input_fields; i++) { 1201 field_rstr_in_buffer[i] = -1; 1202 input_field_order[i] = -1; 1203 } 1204 { 1205 bool is_ordered[CEED_FIELD_MAX]; 1206 CeedInt curr_index = 0; 1207 1208 for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false; 1209 for (CeedInt i = 0; i < num_input_fields; i++) { 1210 CeedVector vec_i; 1211 CeedElemRestriction rstr_i; 1212 1213 if (is_ordered[i]) continue; 1214 field_rstr_in_buffer[i] = i; 1215 is_ordered[i] = true; 1216 input_field_order[curr_index] = i; 1217 curr_index++; 1218 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i)); 1219 if (vec_i == CEED_VECTOR_NONE) continue; // CEED_EVAL_WEIGHT 1220 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i)); 1221 for (CeedInt j = i + 
1; j < num_input_fields; j++) { 1222 CeedVector vec_j; 1223 CeedElemRestriction rstr_j; 1224 1225 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j)); 1226 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j)); 1227 if (rstr_i == rstr_j && vec_i == vec_j) { 1228 field_rstr_in_buffer[j] = i; 1229 is_ordered[j] = true; 1230 input_field_order[curr_index] = j; 1231 curr_index++; 1232 } 1233 CeedCallBackend(CeedVectorDestroy(&vec_j)); 1234 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); 1235 } 1236 CeedCallBackend(CeedVectorDestroy(&vec_i)); 1237 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); 1238 } 1239 } 1240 1241 // -- Input restriction and basis 1242 code << "\n // -- Input field restrictions and basis actions\n"; 1243 for (CeedInt i = 0; i < num_input_fields; i++) { 1244 CeedInt f = input_field_order[i]; 1245 1246 code << " // ---- Input field " << f << "\n"; 1247 1248 // ---- Restriction 1249 CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, f, dim, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], Q_1d, 1250 true, is_tensor, is_at_points, use_3d_slices)); 1251 1252 // ---- Basis action 1253 CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, is_tensor, 1254 is_at_points, use_3d_slices)); 1255 } 1256 1257 // -- Q function 1258 CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, dim, max_num_points, num_input_fields, op_input_fields, qf_input_fields, 1259 num_output_fields, op_output_fields, qf_output_fields, qfunction_name, Q_1d, is_tensor, 1260 is_at_points, use_3d_slices)); 1261 1262 // -- Output basis and restriction 1263 code << "\n // -- Output field basis action and restrictions\n"; 1264 for (CeedInt i = 0; i < num_output_fields; i++) { 1265 code << " // ---- Output field " << i << "\n"; 1266 1267 // ---- Basis action 1268 
CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_tensor, 1269 is_at_points, use_3d_slices)); 1270 1271 // ---- Restriction 1272 CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, i, dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false, 1273 is_tensor, is_at_points, use_3d_slices)); 1274 } 1275 1276 // Close loop and function 1277 code << " }\n"; 1278 code << "}\n"; 1279 code << "// -----------------------------------------------------------------------------\n\n"; 1280 1281 CeedInt block_sizes[3] = {0, 0, 0}; 1282 CeedInt num_elem; 1283 1284 // Compile 1285 CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); 1286 CeedCallBackend(BlockGridCalculate_Hip_gen(is_tensor ? dim : 1, num_elem, data->max_P_1d, Q_1d, block_sizes)); 1287 { 1288 bool is_compile_good = false; 1289 1290 CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "T_1D", block_sizes[0], "BLOCK_SIZE", 1291 block_sizes[0] * block_sizes[1] * block_sizes[2])); 1292 if (is_compile_good) { 1293 *is_good_build = true; 1294 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, operator_name.c_str(), &data->op)); 1295 } else { 1296 *is_good_build = false; 1297 data->use_fallback = true; 1298 } 1299 } 1300 CeedCallBackend(CeedOperatorSetSetupDone(op)); 1301 CeedCallBackend(CeedDestroy(&ceed)); 1302 CeedCallBackend(CeedQFunctionDestroy(&qf)); 1303 return CEED_ERROR_SUCCESS; 1304 } 1305 1306 //------------------------------------------------------------------------------ 1307