1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #define CEED_DEBUG_COLOR 12 9 10 #include <ceed.h> 11 #include <ceed/backend.h> 12 #include <ceed/jit-tools.h> 13 14 #include <iostream> 15 #include <sstream> 16 #include <string> 17 18 #include "../hip-ref/ceed-hip-ref.h" 19 #include "../hip-shared/ceed-hip-shared.h" 20 #include "../hip/ceed-hip-common.h" 21 #include "../hip/ceed-hip-compile.h" 22 #include "ceed-hip-gen.h" 23 24 //------------------------------------------------------------------------------ 25 // Calculate the block size used for launching the operator kernel 26 //------------------------------------------------------------------------------ 27 extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt num_elem, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes) { 28 const CeedInt thread1d = CeedIntMax(Q_1d, P_1d); 29 if (dim == 1) { 30 CeedInt elems_per_block = 64 * thread1d > 256 ? 256 / thread1d : 64; 31 32 elems_per_block = elems_per_block > 0 ? elems_per_block : 1; 33 block_sizes[0] = thread1d; 34 block_sizes[1] = 1; 35 block_sizes[2] = elems_per_block; 36 } else if (dim == 2) { 37 const CeedInt elems_per_block = thread1d < 4 ? 16 : 2; 38 39 block_sizes[0] = thread1d; 40 block_sizes[1] = thread1d; 41 block_sizes[2] = elems_per_block; 42 } else if (dim == 3) { 43 const CeedInt elems_per_block = thread1d < 6 ? 4 : (thread1d < 8 ? 2 : 1); 44 45 block_sizes[0] = thread1d; 46 block_sizes[1] = thread1d; 47 block_sizes[2] = elems_per_block; 48 } 49 return CEED_ERROR_SUCCESS; 50 } 51 52 //------------------------------------------------------------------------------ 53 // Determine type of operator 54 //------------------------------------------------------------------------------ 55 static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields, 56 CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields, 57 CeedQFunctionField *qf_output_fields, CeedInt *max_P_1d, CeedInt *Q_1d, CeedInt *dim, bool *is_tensor, 58 bool *use_3d_slices) { 59 // Find dim, P_1d, Q_1d 60 *max_P_1d = 0; 61 *Q_1d = 0; 62 *dim = 0; 63 *is_tensor = true; 64 for (CeedInt i = 0; i < num_input_fields; i++) { 65 CeedBasis basis; 66 67 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 68 if (basis != CEED_BASIS_NONE) { 69 bool is_field_tensor; 70 CeedInt field_P_1d = 0, field_Q_1d = 0, field_dim = 0; 71 72 // Collect dim, P_1d, and Q_1d 73 CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor)); 74 CeedCheck(is_field_tensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 75 *is_tensor = *is_tensor && is_field_tensor; 76 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d)); 77 *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d); 78 CeedCallBackend(CeedBasisGetDimension(basis, &field_dim)); 79 CeedCheck(*dim == 0 || field_dim == *dim, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible"); 80 *dim = field_dim; 81 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d)); 82 CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible"); 83 *Q_1d = field_Q_1d; 84 } 85 } 86 for (CeedInt i = 0; i < num_output_fields; i++) { 87 CeedBasis basis; 88 89 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 90 if (basis != CEED_BASIS_NONE) { 91 bool is_field_tensor; 92 CeedInt field_P_1d = 0, field_Q_1d = 0, field_dim = 0; 93 94 // Collect dim, P_1d, and Q_1d 95 CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor)); 96 CeedCheck(is_field_tensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 97 *is_tensor = *is_tensor && is_field_tensor; 98 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d)); 99 *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d); 100 CeedCallBackend(CeedBasisGetDimension(basis, &field_dim)); 101 CeedCheck(*dim == 0 || field_dim == *dim, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible"); 102 *dim = field_dim; 103 CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d)); 104 CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible"); 105 *Q_1d = field_Q_1d; 106 } 107 } 108 109 // Only use 3D collocated gradient parallelization strategy when gradient is computed 110 *use_3d_slices = false; 111 if (*dim == 3) { 112 bool was_grad_found = false; 113 114 for (CeedInt i = 0; i < num_input_fields; i++) { 115 CeedEvalMode eval_mode; 116 117 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 118 if (eval_mode == CEED_EVAL_GRAD) { 119 CeedBasis_Hip_shared *basis_data; 120 CeedBasis basis; 121 122 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 123 CeedCallBackend(CeedBasisGetData(basis, &basis_data)); 124 *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true); 125 was_grad_found = true; 126 } 127 } 128 for (CeedInt i = 0; i < num_output_fields; i++) { 129 CeedEvalMode eval_mode; 130 131 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 132 if (eval_mode == CEED_EVAL_GRAD) { 133 CeedBasis_Hip_shared *basis_data; 134 CeedBasis basis; 135 136 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 137 CeedCallBackend(CeedBasisGetData(basis, &basis_data)); 138 *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true); 139 was_grad_found = true; 140 } 141 } 142 } 143 return CEED_ERROR_SUCCESS; 144 } 145 146 //------------------------------------------------------------------------------ 147 // Setup fields 148 //------------------------------------------------------------------------------ 149 static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedOperatorField op_field, 150 CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool use_3d_slices) { 151 std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i); 152 std::string P_name = "P_1d" + var_suffix, Q_name = "Q_1d"; 153 std::string option_name = (is_input ? "inputs" : "outputs"); 154 CeedEvalMode eval_mode = CEED_EVAL_NONE; 155 CeedInt elem_size = 0, num_comp = 0, P_1d = 0; 156 CeedElemRestriction elem_rstr; 157 CeedBasis_Hip_shared *basis_data; 158 CeedBasis basis; 159 160 code << " // -- " << (is_input ? "Input" : "Output") << " field " << i << "\n"; 161 162 // Get field data 163 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr)); 164 if (elem_rstr != CEED_ELEMRESTRICTION_NONE) { 165 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 166 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 167 } 168 CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis)); 169 if (basis != CEED_BASIS_NONE) { 170 CeedCallBackend(CeedBasisGetData(basis, &basis_data)); 171 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 172 } 173 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode)); 174 175 // Set field constants 176 if (eval_mode != CEED_EVAL_WEIGHT) { 177 code << " const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n"; 178 code << " const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n"; 179 } 180 181 // Load basis data 182 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 183 switch (eval_mode) { 184 case CEED_EVAL_NONE: 185 break; 186 case CEED_EVAL_INTERP: 187 if (is_input) data->B.inputs[i] = basis_data->d_interp_1d; 188 else data->B.outputs[i] = basis_data->d_interp_1d; 189 code << " __shared__ CeedScalar s_B" << var_suffix << "[" << P_1d * Q_1d << "];\n"; 190 code << " LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n"; 191 break; 192 case CEED_EVAL_GRAD: 193 if (is_input) data->B.inputs[i] = basis_data->d_interp_1d; 194 else data->B.outputs[i] = basis_data->d_interp_1d; 195 code << " __shared__ CeedScalar s_B" << var_suffix << "[" << P_1d * Q_1d << "];\n"; 196 code << " LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n"; 197 if (use_3d_slices) { 198 if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d; 199 else data->G.outputs[i] = basis_data->d_collo_grad_1d; 200 code << " __shared__ CeedScalar s_G" << var_suffix << "[" << Q_1d * Q_1d << "];\n"; 201 code << " LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; 202 } else { 203 bool has_collo_grad = basis_data->d_collo_grad_1d; 204 205 if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; 206 else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; 207 if (has_collo_grad) { 208 code << " __shared__ CeedScalar s_G" << var_suffix << "[" << Q_1d * Q_1d << "];\n"; 209 code << " LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; 210 } else { 211 code << " __shared__ CeedScalar s_G" << var_suffix << "[" << Q_1d * P_1d << "];\n"; 212 code << " LoadMatrix<" << P_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; 213 } 214 } 215 break; 216 case CEED_EVAL_WEIGHT: 217 break; // No action 218 // LCOV_EXCL_START 219 case CEED_EVAL_DIV: 220 break; // TODO: Not implemented 221 case CEED_EVAL_CURL: 222 break; // TODO: Not implemented 223 // LCOV_EXCL_STOP 224 } 225 return CEED_ERROR_SUCCESS; 226 } 227 228 //------------------------------------------------------------------------------ 229 // Restriction 230 //------------------------------------------------------------------------------ 231 static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedInt dim, 232 CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field, 233 CeedInt Q_1d, bool is_input, bool use_3d_slices) { 234 std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i); 235 std::string P_name = "P_1d" + var_suffix; 236 CeedEvalMode eval_mode = CEED_EVAL_NONE; 237 CeedInt elem_size = 0, num_comp = 0, P_1d = 0; 238 CeedSize l_size; 239 CeedRestrictionType rstr_type = CEED_RESTRICTION_STANDARD; 240 CeedElemRestriction_Hip *rstr_data; 241 CeedElemRestriction elem_rstr; 242 CeedBasis basis; 243 244 // Get field data 245 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr)); 246 if (elem_rstr != CEED_ELEMRESTRICTION_NONE) { 247 CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type)); 248 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 249 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 250 CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); 251 } 252 CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis)); 253 if (basis != CEED_BASIS_NONE) { 254 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 255 } 256 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode)); 257 258 // Restriction 259 if (is_input) { 260 // Input 261 if (field_input_buffer[i] != i) { 262 std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]); 263 264 // Restriction was already done for previous input 265 code << " CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n"; 266 } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices)) { 267 if (eval_mode == CEED_EVAL_NONE) { 268 // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated 269 code << " CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n"; 270 } else { 271 // Otherwise we're using the scratch space 272 code << " CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n"; 273 } 274 switch (rstr_type) { 275 case CEED_RESTRICTION_STANDARD: { 276 CeedInt comp_stride; 277 278 CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 279 code << " const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; 280 CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 281 code << " // CompStride: " << comp_stride << "\n"; 282 data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; 283 code << " ReadLVecStandard" << dim << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name << ">(data, l_size" 284 << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix << ");\n"; 285 break; 286 } 287 case CEED_RESTRICTION_STRIDED: { 288 bool has_backend_strides; 289 CeedInt num_elem; 290 291 CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); 292 CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); 293 CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 294 295 if (!has_backend_strides) { 296 CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); 297 } 298 code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; 299 code << " ReadLVecStrided" << dim << "d<num_comp" << var_suffix << ", " << P_name << "," << strides[0] << "," << strides[1] << "," 300 << strides[2] << ">(data, elem, d" << var_suffix << ", r_e" << var_suffix << ");\n"; 301 break; 302 } 303 // LCOV_EXCL_START 304 case CEED_RESTRICTION_ORIENTED: 305 case CEED_RESTRICTION_CURL_ORIENTED: 306 case CEED_RESTRICTION_POINTS: 307 break; // TODO: Not implemented 308 // LCOV_EXCL_STOP 309 } 310 } 311 } else { 312 // Output 313 switch (rstr_type) { 314 case CEED_RESTRICTION_STANDARD: { 315 CeedInt comp_stride; 316 317 CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 318 code << " const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; 319 CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 320 code << " // CompStride: " << comp_stride << "\n"; 321 data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets; 322 code << " WriteLVecStandard" << dim << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name << ">(data, l_size" 323 << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix << ");\n"; 324 break; 325 } 326 case CEED_RESTRICTION_STRIDED: { 327 bool has_backend_strides; 328 CeedInt num_elem; 329 330 CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); 331 CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); 332 CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 333 334 if (!has_backend_strides) { 335 CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); 336 } 337 code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; 338 code << " WriteLVecStrided" << dim << "d<num_comp" << var_suffix << ", " << P_name << "," << strides[0] << "," << strides[1] << "," 339 << strides[2] << ">(data, elem, r_e" << var_suffix << ", d" << var_suffix << ");\n"; 340 break; 341 } 342 // LCOV_EXCL_START 343 case CEED_RESTRICTION_ORIENTED: 344 case CEED_RESTRICTION_CURL_ORIENTED: 345 case CEED_RESTRICTION_POINTS: 346 break; // TODO: Not implemented 347 // LCOV_EXCL_STOP 348 } 349 } 350 return CEED_ERROR_SUCCESS; 351 } 352 353 //------------------------------------------------------------------------------ 354 // Basis 355 //------------------------------------------------------------------------------ 356 static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedInt dim, 357 CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, 358 bool use_3d_slices) { 359 std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i); 360 std::string P_name = "P_1d" + var_suffix, Q_name = "Q_1d"; 361 CeedEvalMode eval_mode = CEED_EVAL_NONE; 362 CeedInt elem_size = 0, num_comp = 0, P_1d = 0; 363 CeedElemRestriction elem_rstr; 364 CeedBasis basis; 365 366 // Get field data 367 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr)); 368 if (elem_rstr != CEED_ELEMRESTRICTION_NONE) { 369 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 370 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 371 } 372 CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis)); 373 if (basis != CEED_BASIS_NONE) { 374 CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 375 } 376 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode)); 377 378 // Basis 379 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 380 if (is_input) { 381 switch (eval_mode) { 382 case CEED_EVAL_NONE: 383 if (!use_3d_slices) { 384 code << " CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n"; 385 } 386 break; 387 case CEED_EVAL_INTERP: 388 code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n"; 389 code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name 390 << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n"; 391 break; 392 case CEED_EVAL_GRAD: 393 if (use_3d_slices) { 394 code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n"; 395 code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name 396 << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n"; 397 } else { 398 code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << Q_name << "];\n"; 399 code << " Grad" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d<num_comp" << var_suffix 400 << ", P_1d" << var_suffix << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" 401 << var_suffix << ");\n"; 402 } 403 break; 404 case CEED_EVAL_WEIGHT: { 405 CeedBasis_Hip_shared *basis_data; 406 407 code << " CeedScalar r_q" << var_suffix << "[" << Q_name << "];\n"; 408 CeedCallBackend(CeedBasisGetData(basis, &basis_data)); 409 data->W = basis_data->d_q_weight_1d; 410 code << " Weight" << (dim > 1 ? "Tensor" : "") << dim << "d<" << Q_name << ">(data, W, r_q" << var_suffix << ");\n"; 411 break; 412 } 413 // LCOV_EXCL_START 414 case CEED_EVAL_DIV: 415 case CEED_EVAL_CURL: 416 break; // TODO: Not implemented 417 // LCOV_EXCL_STOP 418 } 419 } else { 420 switch (eval_mode) { 421 case CEED_EVAL_NONE: 422 code << " CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n"; 423 break; // No action 424 case CEED_EVAL_INTERP: 425 code << " CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n"; 426 code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name 427 << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; 428 break; 429 case CEED_EVAL_GRAD: 430 code << " CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n"; 431 if (use_3d_slices) { 432 code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name 433 << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; 434 } else { 435 code << " GradTranspose" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d<num_comp" 436 << var_suffix << ", " << P_name << "," << Q_name << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix 437 << ", r_e" << var_suffix << ");\n"; 438 } 439 break; 440 // LCOV_EXCL_START 441 case CEED_EVAL_WEIGHT: 442 break; // Should not occur 443 case CEED_EVAL_DIV: 444 case CEED_EVAL_CURL: 445 break; // TODO: Not implemented 446 // LCOV_EXCL_STOP 447 } 448 } 449 return CEED_ERROR_SUCCESS; 450 } 451 452 //------------------------------------------------------------------------------ 453 // QFunction 454 //------------------------------------------------------------------------------ 455 static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt dim, CeedInt num_input_fields, 456 CeedOperatorField *op_input_fields, CeedQFunctionField *qf_input_fields, 457 CeedInt num_output_fields, CeedOperatorField *op_output_fields, 458 CeedQFunctionField *qf_output_fields, std::string qfunction_name, CeedInt Q_1d, 459 bool use_3d_slices) { 460 std::string Q_name = "Q_1d"; 461 CeedEvalMode eval_mode = CEED_EVAL_NONE; 462 CeedElemRestriction elem_rstr; 463 464 // Setup output arays 465 code << "\n // -- Output field setup\n"; 466 for (CeedInt i = 0; i < num_output_fields; i++) { 467 std::string var_suffix = "_out_" + std::to_string(i); 468 469 code << " // ---- Output field " << i << "\n"; 470 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 471 if (eval_mode == CEED_EVAL_NONE || eval_mode == CEED_EVAL_INTERP) { 472 code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n"; 473 } 474 if (eval_mode == CEED_EVAL_GRAD) { 475 if (use_3d_slices) { 476 // Accumulator for gradient slices 477 code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n"; 478 code << " for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) {\n"; 479 code << " r_q" << var_suffix << "[i] = 0.0;\n"; 480 code << " }\n"; 481 } else { 482 code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << Q_name << "];\n"; 483 } 484 } 485 } 486 487 // We treat quadrature points per slice in 3d to save registers 488 if (use_3d_slices) { 489 code << "\n // Note: Using planes of 3D elements\n"; 490 code << " #pragma unroll\n"; 491 code << " for (CeedInt q = 0; q < " << Q_name << "; q++) {\n"; 492 code << " // -- Input fields\n"; 493 for (CeedInt i = 0; i < num_input_fields; i++) { 494 std::string var_suffix = "_in_" + std::to_string(i); 495 496 code << " // ---- Input field " << i << "\n"; 497 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 498 // Basis action 499 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 500 switch (eval_mode) { 501 case CEED_EVAL_NONE: 502 bool is_strided; 503 504 code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 505 506 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 507 CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); 508 if (is_strided) { 509 bool has_backend_strides; 510 CeedInt num_elem, elem_size; 511 512 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 513 CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); 514 CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); 515 CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 516 517 if (!has_backend_strides) { 518 CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); 519 } 520 code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; 521 code << " ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << "," << strides[0] << "," << strides[1] << "," 522 << strides[2] << ">(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n"; 523 } else { 524 CeedSize l_size = 0; 525 CeedInt comp_stride; 526 CeedElemRestriction_Hip *rstr_data; 527 528 CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 529 code << " const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; 530 CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 531 code << " // CompStride: " << comp_stride << "\n"; 532 CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); 533 data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; 534 code << " ReadEVecSliceStandard3d<num_comp" << var_suffix << ", " << comp_stride << ", " << Q_name << ">(data, l_size" << var_suffix 535 << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n"; 536 } 537 break; 538 case CEED_EVAL_INTERP: 539 code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 540 code << " for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n"; 541 code << " r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n"; 542 code << " }\n"; 543 break; 544 case CEED_EVAL_GRAD: 545 code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n"; 546 code << " GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_q" << var_suffix << ", s_G" << var_suffix 547 << ", r_s" << var_suffix << ");\n"; 548 break; 549 case CEED_EVAL_WEIGHT: 550 code << " CeedScalar r_s" << var_suffix << "[1];\n"; 551 code << " r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n"; 552 break; // No action 553 // LCOV_EXCL_START 554 case CEED_EVAL_DIV: 555 break; // TODO: Not implemented 556 case CEED_EVAL_CURL: 557 break; // TODO: Not implemented 558 // LCOV_EXCL_STOP 559 } 560 } 561 code << "\n // -- Output fields\n"; 562 for (CeedInt i = 0; i < num_output_fields; i++) { 563 std::string var_suffix = "_out_" + std::to_string(i); 564 565 code << " // ---- Output field " << i << "\n"; 566 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 567 // Basis action 568 switch (eval_mode) { 569 case CEED_EVAL_NONE: 570 code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 571 break; // No action 572 case CEED_EVAL_INTERP: 573 code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 574 break; 575 case CEED_EVAL_GRAD: 576 code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n"; 577 break; 578 // LCOV_EXCL_START 579 case CEED_EVAL_WEIGHT: 580 break; // Should not occur 581 case CEED_EVAL_DIV: 582 break; // TODO: Not implemented 583 case CEED_EVAL_CURL: 584 break; // TODO: Not implemented 585 // LCOV_EXCL_STOP 586 } 587 } 588 } else { 589 code << "\n // Note: Using full elements\n"; 590 code << " {\n"; 591 code << " // -- Input fields\n"; 592 for (CeedInt i = 0; i < num_input_fields; i++) { 593 code << " // ---- Input field " << i << "\n"; 594 code << " CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n"; 595 } 596 code << " // -- Output fields\n"; 597 for (CeedInt i = 0; i < num_output_fields; i++) { 598 code << " // ---- Output field " << i << "\n"; 599 code << " CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n"; 600 } 601 } 602 603 // Input and output buffers 604 code << "\n // -- QFunction inputs and outputs\n"; 605 code << " // ---- Inputs\n"; 606 code << " CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n"; 607 for (CeedInt i = 0; i < num_input_fields; i++) { 608 code << " // ------ Input field " << i << "\n"; 609 code << " inputs[" << i << "] = r_s_in_" << i << ";\n"; 610 } 611 code << " // ---- Outputs\n"; 612 code << " CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n"; 613 for (CeedInt i = 0; i < num_output_fields; i++) { 614 code << " // ------ Output field " << i << "\n"; 615 code << " outputs[" << i << "] = r_s_out_" << i << ";\n"; 616 } 617 618 // Apply QFunction 619 code << "\n // -- Apply QFunction\n"; 620 code << " " << qfunction_name << "(ctx, "; 621 if (dim != 3 || use_3d_slices) { 622 code << "1"; 623 } else { 624 code << "Q_1d"; 625 } 626 code << ", inputs, outputs);\n"; 627 628 // Copy or apply transpose grad, if needed 629 if (use_3d_slices) { 630 code << " // -- Output fields\n"; 631 for (CeedInt i = 0; i < num_output_fields; i++) { 632 std::string var_suffix = "_out_" + std::to_string(i); 633 std::string P_name = "P_1d" + var_suffix; 634 635 code << " // ---- Output field " << i << "\n"; 636 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 637 // Basis action 638 code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 639 switch (eval_mode) { 640 case CEED_EVAL_NONE: 641 code << " for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n"; 642 code << " r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n"; 643 code << " }\n"; 644 break; // No action 645 case CEED_EVAL_INTERP: 646 code << " for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n"; 647 code << " r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n"; 648 code << " }\n"; 649 break; 650 case CEED_EVAL_GRAD: 651 code << " GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_s" << var_suffix << ", s_G" 652 << var_suffix << ", r_q" << var_suffix << ");\n"; 653 break; 654 // LCOV_EXCL_START 655 case CEED_EVAL_WEIGHT: 656 break; // Should not occur 657 case CEED_EVAL_DIV: 658 break; // TODO: Not implemented 659 case CEED_EVAL_CURL: 660 break; // TODO: Not implemented 661 // LCOV_EXCL_STOP 662 } 663 } 664 } 665 code << " }\n"; 666 return CEED_ERROR_SUCCESS; 667 } 668 669 //------------------------------------------------------------------------------ 670 // Build single operator kernel 671 //------------------------------------------------------------------------------ 672 extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) { 673 bool is_tensor = true, use_3d_slices = false; 674 Ceed ceed; 675 CeedInt Q_1d, num_input_fields, num_output_fields, dim = 1; 676 CeedQFunctionField *qf_input_fields, *qf_output_fields; 677 CeedQFunction_Hip_gen *qf_data; 678 CeedQFunction qf; 679 CeedOperatorField *op_input_fields, *op_output_fields; 680 CeedOperator_Hip_gen *data; 681 std::ostringstream code; 682 683 { 684 bool is_setup_done; 685 686 CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); 687 if (is_setup_done) return CEED_ERROR_SUCCESS; 688 } 689 690 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 691 CeedCallBackend(CeedOperatorGetData(op, &data)); 692 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 693 CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); 694 CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 695 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 696 697 // Get operator data 698 CeedCallBackend(CeedOperatorBuildKernelData_Hip_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields, 699 qf_output_fields, &data->max_P_1d, &Q_1d, &dim, &is_tensor, &use_3d_slices)); 700 if (dim == 0) dim = 1; 701 data->dim = dim; 702 if (Q_1d == 0) { 703 CeedInt Q; 704 705 CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q)); 706 Q_1d = Q; 707 } 708 data->Q_1d = Q_1d; 709 710 // Check for restriction only identity operator 711 { 712 bool is_identity_qf; 713 714 CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf)); 715 if (is_identity_qf) { 716 CeedEvalMode eval_mode_in, eval_mode_out; 717 718 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in)); 719 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out)); 720 CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND, 721 "Backend does not implement restriction only identity operators"); 722 } 723 } 724 725 // Load basis source files 726 // TODO: Add non-tensor, AtPoints 727 code << "// Tensor basis source\n"; 728 code << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n"; 729 code << "// CodeGen operator source\n"; 730 code << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n"; 731 732 // Get QFunction name 733 std::string qfunction_name(qf_data->qfunction_name); 734 std::string operator_name; 735 736 operator_name = "CeedKernelHipGenOperator_" + qfunction_name; 737 738 // Define CEED_Q_VLA 739 code << "\n#undef CEED_Q_VLA\n"; 740 if (dim != 3 || use_3d_slices) { 741 code << "#define CEED_Q_VLA 1\n\n"; 742 } else { 743 code << "#define CEED_Q_VLA " << Q_1d << "\n\n"; 744 } 745 746 // Add user QFunction source 747 { 748 const char *source_path; 749 750 CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path)); 751 CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file"); 752 753 code << "// User QFunction source\n"; 754 code << "#include \"" << source_path << "\"\n\n"; 755 } 756 757 // Setup 758 code << "\n// -----------------------------------------------------------------------------\n"; 759 code << "// Operator Kernel\n"; 760 code << "// \n"; 761 code << "// d_[in,out]_i: CeedVector device array\n"; 762 code << "// r_[in,out]_e_i: Element vector register\n"; 763 code << "// r_[in,out]_q_i: Quadrature space vector register\n"; 764 code << "// r_[in,out]_s_i: Quadrature space slice vector register\n"; 765 code << "// \n"; 766 code << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n"; 767 code << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; 768 code << "// -----------------------------------------------------------------------------\n"; 769 code << "\nextern \"C\" __launch_bounds__(BLOCK_SIZE)\n"; 770 code << "__global__ void " << operator_name 771 << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar* W) {\n"; 772 773 // Scratch buffers 774 for (CeedInt i = 0; i < num_input_fields; i++) { 775 CeedEvalMode eval_mode; 776 777 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 778 if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 779 code << " const CeedScalar *d_in_" << i << " = fields.inputs[" << i << "];\n"; 780 } 781 } 782 for (CeedInt i = 0; i < num_output_fields; i++) { 783 code << " CeedScalar *d_out_" << i << " = fields.outputs[" << i << "];\n"; 784 } 785 786 code << " const CeedInt dim = " << dim << ";\n"; 787 code << " const CeedInt Q_1d = " << Q_1d << ";\n"; 788 789 // Shared data 790 code << " extern __shared__ CeedScalar slice[];\n"; 791 code << " SharedData_Hip data;\n"; 792 code << " data.t_id_x = threadIdx.x;\n"; 793 code << " data.t_id_y = threadIdx.y;\n"; 794 code << " data.t_id_z = threadIdx.z;\n"; 795 code << " data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 796 code << " data.slice = slice + data.t_id_z*T_1D" << (dim > 1 ? "*T_1D" : "") << ";\n"; 797 798 // Initialize constants, and matrices B and G 799 code << "\n // Input field constants and basis data\n"; 800 for (CeedInt i = 0; i < num_input_fields; i++) { 801 CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices)); 802 } 803 code << "\n // Output field constants and basis data\n"; 804 for (CeedInt i = 0; i < num_output_fields; i++) { 805 CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices)); 806 } 807 808 // Loop over all elements 809 code << "\n // Element loop\n"; 810 code << " __syncthreads();\n"; 811 code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n"; 812 813 // -- Compute minimum buffer space needed 814 CeedInt max_rstr_buffer_size = 0; 815 816 for (CeedInt i = 0; i < num_input_fields; i++) { 817 CeedInt num_comp, elem_size; 818 CeedElemRestriction elem_rstr; 819 820 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 821 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 822 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 823 max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * elem_size); 824 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 825 } 826 for (CeedInt i = 0; i < num_output_fields; i++) { 827 CeedInt num_comp, elem_size; 828 CeedElemRestriction elem_rstr; 829 830 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 831 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 832 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 833 max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * elem_size); 834 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 835 } 836 code << " // Scratch restriction buffer space\n"; 837 code << " CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n"; 838 839 // -- Determine best input field processing order 840 CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX]; 841 842 for (CeedInt i = 0; i < num_input_fields; i++) { 843 field_rstr_in_buffer[i] = -1; 844 input_field_order[i] = -1; 845 } 846 { 847 bool is_ordered[CEED_FIELD_MAX]; 848 CeedInt curr_index = 0; 849 850 for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false; 851 for (CeedInt i = 0; i < num_input_fields; i++) { 852 CeedVector vec_i; 853 CeedElemRestriction rstr_i; 854 855 if (is_ordered[i]) continue; 856 field_rstr_in_buffer[i] = i; 857 is_ordered[i] = true; 858 input_field_order[curr_index] = i; 859 curr_index++; 860 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i)); 861 if (vec_i == CEED_VECTOR_NONE) continue; // CEED_EVAL_WEIGHT 862 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i)); 863 for (CeedInt j = i + 1; j < num_input_fields; j++) { 864 CeedVector vec_j; 865 CeedElemRestriction rstr_j; 866 867 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j)); 868 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j)); 869 if (rstr_i == rstr_j && vec_i == vec_j) { 870 field_rstr_in_buffer[j] = i; 871 is_ordered[j] = true; 872 input_field_order[curr_index] = j; 873 curr_index++; 874 } 875 } 876 } 877 } 878 879 // -- Input restriction and basis 880 code << " // -- Input field restrictions and basis actions\n"; 881 for (CeedInt i = 0; i < num_input_fields; i++) { 882 CeedInt f = input_field_order[i]; 883 884 code << " // ---- Input field " << f << "\n"; 885 886 // ---- Restriction 887 CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, f, dim, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], Q_1d, 888 true, use_3d_slices)); 889 890 // ---- Basis action 891 CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, use_3d_slices)); 892 } 893 894 // -- Q function 895 CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, dim, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, 896 op_output_fields, qf_output_fields, qfunction_name, Q_1d, use_3d_slices)); 897 898 // -- Output basis and restriction 899 code << "\n // -- Output field basis action and restrictions\n"; 900 for (CeedInt i = 0; i < num_output_fields; i++) { 901 code << " // ---- Output field " << i << "\n"; 902 903 // ---- Basis action 904 CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices)); 905 906 // ---- Restriction 907 CeedCallBackend( 908 CeedOperatorBuildKernelRestriction_Hip_gen(code, data, i, dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices)); 909 } 910 911 // Close loop and function 912 code << " }\n"; 913 code << "}\n"; 914 code << "// -----------------------------------------------------------------------------\n\n"; 915 916 CeedInt block_sizes[3] = {0, 0, 0}; 917 CeedInt num_elem; 918 919 CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); 920 CeedCallBackend(BlockGridCalculate_Hip_gen(dim, num_elem, data->max_P_1d, Q_1d, block_sizes)); 921 CeedCallBackend(CeedCompile_Hip(ceed, code.str().c_str(), &data->module, 2, "T_1D", block_sizes[0], "BLOCK_SIZE", 922 block_sizes[0] * block_sizes[1] * block_sizes[2])); 923 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, operator_name.c_str(), &data->op)); 924 CeedCallBackend(CeedOperatorSetSetupDone(op)); 925 CeedCallBackend(CeedDestroy(&ceed)); 926 CeedCallBackend(CeedQFunctionDestroy(&qf)); 927 return CEED_ERROR_SUCCESS; 928 } 929 930 //------------------------------------------------------------------------------ 931