1 // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #define CEED_DEBUG_COLOR 12 9 10 #include <ceed.h> 11 #include <ceed/backend.h> 12 #include <ceed/gen-tools.h> 13 #include <ceed/jit-tools.h> 14 15 #include <iostream> 16 #include <sstream> 17 #include <string> 18 19 #include "../hip-ref/ceed-hip-ref.h" 20 #include "../hip-shared/ceed-hip-shared.h" 21 #include "../hip/ceed-hip-common.h" 22 #include "../hip/ceed-hip-compile.h" 23 #include "ceed-hip-gen.h" 24 25 struct FieldReuse_Hip { 26 CeedInt index; 27 bool is_input; 28 CeedEvalMode eval_mode; 29 }; 30 31 //------------------------------------------------------------------------------ 32 // Calculate the block size used for launching the operator kernel 33 //------------------------------------------------------------------------------ 34 extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt num_elem, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes) { 35 const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 36 if (dim == 1) { 37 CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; 38 39 elems_per_block = elems_per_block > 0 ? elems_per_block : 1; 40 block_sizes[0] = thread_1d; 41 block_sizes[1] = 1; 42 block_sizes[2] = elems_per_block; 43 } else if (dim == 2) { 44 const CeedInt elems_per_block = thread_1d < 4 ? 16 : 2; 45 46 block_sizes[0] = thread_1d; 47 block_sizes[1] = thread_1d; 48 block_sizes[2] = elems_per_block; 49 } else if (dim == 3) { 50 const CeedInt elems_per_block = thread_1d < 6 ? 4 : (thread_1d < 8 ? 
2 : 1); 51 52 block_sizes[0] = thread_1d; 53 block_sizes[1] = thread_1d; 54 block_sizes[2] = elems_per_block; 55 } 56 return CEED_ERROR_SUCCESS; 57 } 58 59 //------------------------------------------------------------------------------ 60 // Determine type of operator 61 //------------------------------------------------------------------------------ 62 static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields, 63 CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields, 64 CeedQFunctionField *qf_output_fields, CeedInt *max_P, CeedInt *max_P_1d, CeedInt *Q, CeedInt *Q_1d, 65 CeedInt *max_dim, bool *is_all_tensor, bool *use_3d_slices) { 66 // Check if all are tensor 67 *is_all_tensor = true; 68 for (CeedInt i = 0; i < num_input_fields; i++) { 69 CeedBasis basis; 70 71 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 72 if (basis != CEED_BASIS_NONE) { 73 bool is_field_tensor; 74 75 CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor)); 76 *is_all_tensor = *is_all_tensor && is_field_tensor; 77 } 78 CeedCallBackend(CeedBasisDestroy(&basis)); 79 } 80 for (CeedInt i = 0; i < num_output_fields; i++) { 81 CeedBasis basis; 82 83 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 84 if (basis != CEED_BASIS_NONE) { 85 bool is_field_tensor; 86 87 CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor)); 88 *is_all_tensor = *is_all_tensor && is_field_tensor; 89 } 90 CeedCallBackend(CeedBasisDestroy(&basis)); 91 } 92 93 // Find max_P, max_P_1d, Q, and Q_1d 94 bool is_all_3d = true; 95 96 *max_P = 0; 97 *max_P_1d = 0; 98 *Q = 0; 99 *Q_1d = 0; 100 for (CeedInt i = 0; i < num_input_fields; i++) { 101 CeedBasis basis; 102 103 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 104 if (basis != CEED_BASIS_NONE) { 105 bool is_field_tensor; 106 CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, 
field_Q_1d = 0;

      // Check if 3D
      CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
      is_all_3d = is_all_3d && (field_dim == 3);
      *max_dim  = CeedIntMax(*max_dim, field_dim);

      // Collect P, P_1d, Q, and Q_1d
      CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
      *max_P = CeedIntMax(*max_P, field_P);
      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
      if (is_field_tensor) {
        CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
        *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
      }
      // All fields must share a single quadrature space
      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
      CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
      *Q = field_Q;
      if (is_field_tensor) {
        CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
        CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
        *Q_1d = field_Q_1d;
      }
    }
    CeedCallBackend(CeedBasisDestroy(&basis));
  }
  // Same scan for the output fields
  for (CeedInt i = 0; i < num_output_fields; i++) {
    CeedBasis basis;

    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
    if (basis != CEED_BASIS_NONE) {
      bool    is_field_tensor;
      CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;

      // Check if 3D
      CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
      is_all_3d = is_all_3d && (field_dim == 3);
      *max_dim  = CeedIntMax(*max_dim, field_dim);

      // Collect P, P_1d, Q, and Q_1d
      CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
      *max_P = CeedIntMax(*max_P, field_P);
      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
      if (is_field_tensor) {
        CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
        *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
      }
      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
      CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
      *Q = field_Q;
      if (is_field_tensor) {
        CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
        CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
        *Q_1d = field_Q_1d;
      }
    }
    CeedCallBackend(CeedBasisDestroy(&basis));
  }

  // Only use 3D collocated gradient parallelization strategy when gradient is computed
  // use_3d_slices ends up true only if EVERY gradient field has d_collo_grad_1d
  *use_3d_slices = false;
  if (is_all_3d && *is_all_tensor) {
    bool was_grad_found = false;

    for (CeedInt i = 0; i < num_input_fields; i++) {
      CeedEvalMode eval_mode;

      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
      if (eval_mode == CEED_EVAL_GRAD) {
        CeedBasis_Hip_shared *basis_data;
        CeedBasis             basis;

        CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
        CeedCallBackend(CeedBasisGetData(basis, &basis_data));
        // First gradient field sets the flag; later ones AND into it
        *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
        was_grad_found = true;
        CeedCallBackend(CeedBasisDestroy(&basis));
      }
    }
    for (CeedInt i = 0; i < num_output_fields; i++) {
      CeedEvalMode eval_mode;

      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      if (eval_mode == CEED_EVAL_GRAD) {
        CeedBasis_Hip_shared *basis_data;
        CeedBasis             basis;

        CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
        CeedCallBackend(CeedBasisGetData(basis, &basis_data));
        *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
        was_grad_found = true;
        CeedCallBackend(CeedBasisDestroy(&basis));
      }
    }
  }
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Setup fields
//
// Emits per-field constants (dim, P/P_1d, num_comp) into the generated kernel
// source and arranges for the basis matrices (B, and G for gradients) to be
// available: either aliased from a previously-loaded field (field_reuse),
// set to NULL (collocated / skipped active load), or loaded into __shared__
// memory via LoadMatrix. Also records device pointers in data->B / data->G.
//------------------------------------------------------------------------------
static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i,
                                                    CeedOperatorField op_field, CeedQFunctionField qf_field, FieldReuse_Hip field_reuse,
                                                    CeedInt max_dim, CeedInt Q, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
                                                    bool use_3d_slices, bool skip_active_load) {
  bool      is_tensor = true, is_active = true;
  CeedBasis basis;

  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
  if (basis != CEED_BASIS_NONE) CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
  {
    // Active fields are the operator's input/output vectors themselves
    CeedVector vec;

    CeedCallBackend(CeedOperatorFieldGetVector(op_field, &vec));
    is_active = vec == CEED_VECTOR_ACTIVE;
    CeedCallBackend(CeedVectorDestroy(&vec));
  }

  const char           *field_name;
  std::string           var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
  std::string           P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
  std::string           option_name = (is_input ? "inputs" : "outputs");
  CeedEvalMode          eval_mode = CEED_EVAL_NONE;
  CeedInt               elem_size = 0, num_comp = 0, dim = max_dim, P_1d = 0;
  CeedElemRestriction   elem_rstr;
  CeedBasis_Hip_shared *basis_data;

  // Field reuse info
  bool use_previous_field = field_reuse.index != -1;

  CeedCallBackend(CeedOperatorFieldGetName(op_field, &field_name));
  code << tab << "// -- " << (is_input ?
"Input" : "Output") << " field " << i << ": " << field_name << "\n"; 238 239 // Get field data 240 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr)); 241 if (elem_rstr != CEED_ELEMRESTRICTION_NONE) { 242 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 243 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 244 } 245 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 246 if (basis != CEED_BASIS_NONE) { 247 CeedCallBackend(CeedBasisGetData(basis, &basis_data)); 248 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 249 if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 250 else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d)); 251 } 252 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode)); 253 254 // Set field constants 255 code << tab << "const CeedInt dim" << var_suffix << " = " << dim << ";\n"; 256 if (is_tensor && !is_all_tensor) { 257 CeedInt P = 0; 258 259 CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); 260 code << tab << "const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q : P) << ";\n"; 261 } 262 code << tab << "const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? 
Q_1d : P_1d) << ";\n"; 263 if (eval_mode != CEED_EVAL_WEIGHT) { 264 code << tab << "const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n"; 265 } 266 267 // Load basis data 268 code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 269 switch (eval_mode) { 270 case CEED_EVAL_NONE: 271 break; 272 case CEED_EVAL_INTERP: 273 if (is_at_points) { 274 // AtPoints 275 if (!basis_data->d_chebyshev_interp_1d) { 276 CeedSize interp_bytes; 277 CeedScalar *chebyshev_interp_1d; 278 279 interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 280 CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 281 CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 282 CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes)); 283 CeedCallHip(CeedBasisReturnCeed(basis), 284 hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice)); 285 CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 286 } 287 if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d; 288 else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d; 289 } else { 290 // Standard quadrature 291 if (is_input) data->B.inputs[i] = basis_data->d_interp_1d; 292 else data->B.outputs[i] = basis_data->d_interp_1d; 293 } 294 if (use_previous_field && !skip_active_load) { 295 std::string reuse_var = "s_B" + ((field_reuse.is_input ? 
"_in_" : "_out_") + std::to_string(field_reuse.index)); 296 297 code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n"; 298 } else { 299 bool is_collocated = false; 300 301 CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated)); 302 if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) { 303 code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n"; 304 } else { 305 code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n"; 306 code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n"; 307 } 308 } 309 break; 310 case CEED_EVAL_GRAD: 311 if (is_at_points) { 312 // AtPoints 313 if (!basis_data->d_chebyshev_interp_1d) { 314 CeedSize interp_bytes; 315 CeedScalar *chebyshev_interp_1d; 316 317 interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 318 CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 319 CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 320 CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes)); 321 CeedCallHip(CeedBasisReturnCeed(basis), 322 hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice)); 323 CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 324 } 325 if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d; 326 else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d; 327 } else { 328 // Standard quadrature 329 if (is_input) data->B.inputs[i] = basis_data->d_interp_1d; 330 else data->B.outputs[i] = basis_data->d_interp_1d; 331 } 332 if (is_tensor) { 333 if (use_previous_field && !skip_active_load) { 334 std::string reuse_var = "s_B" + ((field_reuse.is_input ? 
"_in_" : "_out_") + std::to_string(field_reuse.index)); 335 336 code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n"; 337 } else { 338 bool is_collocated = false; 339 340 CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated)); 341 if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) { 342 code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n"; 343 } else { 344 code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n"; 345 code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n"; 346 } 347 } 348 } 349 if (is_at_points) break; // No G mat for AtPoints 350 if (use_3d_slices) { 351 if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d; 352 else data->G.outputs[i] = basis_data->d_collo_grad_1d; 353 if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) { 354 std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index)); 355 356 code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n"; 357 } else if (is_active && skip_active_load) { 358 code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n"; 359 } else { 360 code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n"; 361 code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; 362 } 363 } else { 364 bool has_collo_grad = basis_data->d_collo_grad_1d; 365 366 if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; 367 else data->G.outputs[i] = has_collo_grad ? 
basis_data->d_collo_grad_1d : basis_data->d_grad_1d; 368 if (has_collo_grad) { 369 if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) { 370 std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index)); 371 372 code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n"; 373 } else if (is_active && skip_active_load) { 374 code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n"; 375 } else { 376 code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n"; 377 code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; 378 } 379 } else { 380 if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) { 381 std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index)); 382 383 code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n"; 384 } else if (is_active && skip_active_load) { 385 code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n"; 386 } else { 387 code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim") 388 << (is_tensor ? "" : var_suffix) << "];\n"; 389 code << tab << "LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << (is_tensor ? "" : var_suffix) << ">(data, G." 
390 << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; 391 } 392 } 393 } 394 break; 395 case CEED_EVAL_WEIGHT: 396 break; // No action 397 // LCOV_EXCL_START 398 case CEED_EVAL_DIV: 399 case CEED_EVAL_CURL: 400 break; // TODO: Not implemented 401 // LCOV_EXCL_STOP 402 } 403 CeedCallBackend(CeedBasisDestroy(&basis)); 404 return CEED_ERROR_SUCCESS; 405 } 406 407 //------------------------------------------------------------------------------ 408 // Restriction 409 //------------------------------------------------------------------------------ 410 static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i, 411 CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field, 412 CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points, 413 bool use_3d_slices) { 414 std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i); 415 std::string P_name = (is_all_tensor ? 
"P_1d" : "P") + var_suffix; 416 CeedEvalMode eval_mode = CEED_EVAL_NONE; 417 CeedInt elem_size = 0, num_comp = 0; 418 CeedSize l_size; 419 CeedRestrictionType rstr_type = CEED_RESTRICTION_STANDARD; 420 CeedElemRestriction_Hip *rstr_data; 421 CeedElemRestriction elem_rstr; 422 423 // Get field data 424 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr)); 425 if (elem_rstr != CEED_ELEMRESTRICTION_NONE) { 426 CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type)); 427 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 428 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 429 CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); 430 } 431 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode)); 432 433 // Restriction 434 if (is_input) { 435 // Input 436 if (field_input_buffer[i] != i) { 437 std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]); 438 439 // Restriction was already done for previous input 440 code << tab << "CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n"; 441 } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) { 442 if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) { 443 // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated 444 code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n"; 445 } else if (rstr_type != CEED_RESTRICTION_POINTS) { 446 // Otherwise we're using the scratch space 447 code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n"; 448 } 449 switch (rstr_type) { 450 case CEED_RESTRICTION_STANDARD: { 451 CeedInt comp_stride; 452 453 CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 454 code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; 455 
CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 456 code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 457 data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; 458 code << tab << "ReadLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", " 459 << P_name << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix 460 << ");\n"; 461 break; 462 } 463 case CEED_RESTRICTION_STRIDED: { 464 bool has_backend_strides; 465 CeedInt num_elem; 466 467 CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); 468 CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); 469 CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 470 471 if (!has_backend_strides) { 472 CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); 473 } 474 code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1] 475 << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n"; 476 code << tab << "ReadLVecStrided" << (is_all_tensor ? 
max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides" 477 << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, d" << var_suffix << ", r_e" 478 << var_suffix << ");\n"; 479 break; 480 } 481 case CEED_RESTRICTION_POINTS: { 482 CeedInt comp_stride; 483 484 CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 485 code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 486 data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; 487 break; 488 } 489 // LCOV_EXCL_START 490 case CEED_RESTRICTION_ORIENTED: 491 case CEED_RESTRICTION_CURL_ORIENTED: 492 break; // TODO: Not implemented 493 // LCOV_EXCL_STOP 494 } 495 } 496 } else { 497 // Output 498 switch (rstr_type) { 499 case CEED_RESTRICTION_STANDARD: { 500 CeedInt comp_stride; 501 502 CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 503 code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; 504 CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 505 code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 506 data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets; 507 code << tab << "WriteLVecStandard" << (is_all_tensor ? 
max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", " 508 << P_name << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix 509 << ");\n"; 510 break; 511 } 512 case CEED_RESTRICTION_STRIDED: { 513 bool has_backend_strides; 514 CeedInt num_elem; 515 516 CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); 517 CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); 518 CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 519 520 if (!has_backend_strides) { 521 CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); 522 } 523 code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1] 524 << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n"; 525 code << tab << "WriteLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides" 526 << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, r_e" << var_suffix << ", d" << var_suffix 527 << ");\n"; 528 break; 529 } 530 case CEED_RESTRICTION_POINTS: 531 data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets; 532 break; 533 // LCOV_EXCL_START 534 case CEED_RESTRICTION_ORIENTED: 535 case CEED_RESTRICTION_CURL_ORIENTED: 536 break; // TODO: Not implemented 537 // LCOV_EXCL_STOP 538 } 539 } 540 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 541 return CEED_ERROR_SUCCESS; 542 } 543 544 //------------------------------------------------------------------------------ 545 // Basis 546 //------------------------------------------------------------------------------ 547 static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i, CeedOperatorField op_field, 548 CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d, bool is_input, 
bool is_all_tensor,
                                                bool is_at_points, bool use_3d_slices) {
  bool      is_tensor = true, is_collocated = true;
  CeedBasis basis;

  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
  CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
  CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));

  std::string         var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
  std::string         P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
  CeedEvalMode        eval_mode = CEED_EVAL_NONE;
  CeedInt             dim = max_dim, elem_size = 0, num_comp = 0, P_1d = 0;
  CeedElemRestriction elem_rstr;

  // Get field data
  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
  }
  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
  if (basis != CEED_BASIS_NONE) {
    CeedCallBackend(CeedBasisGetDimension(basis, &dim));
    // P_1d doubles as total node count for non-tensor bases
    if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
    else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
  }
  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));

  // Basis
  code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
  if (is_input) {
    // Forward basis action: r_e (element nodes) -> r_q (quadrature points)
    switch (eval_mode) {
      case CEED_EVAL_NONE:
        if (!use_3d_slices && !is_at_points) {
          // No basis action: quadrature values are the element values
          code << tab << "CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n";
        }
        break;
      case CEED_EVAL_INTERP:
        if (is_at_points) {
          // AtPoints: interpolate to Chebyshev coefficients r_c
          std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";

          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
               << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
        } else {
          std::string function_name = is_tensor ? ((dim == 1 ? "Interp" : "InterpTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
                                                   std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
                                                : "InterpNonTensor";
          std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);

          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
               << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
        }
        break;
      case CEED_EVAL_GRAD:
        if (is_at_points) {
          // AtPoints: interpolate to Chebyshev coefficients; gradient applied at points
          std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";

          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
               << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
        } else if (use_3d_slices) {
          // 3D slices: only interpolate here; collocated gradient is applied per slice later
          std::string function_name =
              (dim > 1 ? "InterpTensor" : "Interp") + std::string(is_collocated ? "CollocatedNodes" : "") + std::to_string(dim) + "d";

          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
               << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
        } else if (is_tensor) {
          bool        is_collocated_grad = dim == 3 && Q_1d >= P_1d;
          std::string function_name =
              (dim == 1 ? "Grad" : ("GradTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) +
              std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
          std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);

          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
               << (is_all_tensor && dim >= 3 ? Q_name : "1") << "];\n";
          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
               << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
        } else {
          std::string function_name = "GradNonTensor";

          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
               << ", OP_T_1D>(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
        }
        break;
      case CEED_EVAL_WEIGHT: {
        if (is_at_points) {
          code << tab << "// Nothing to do AtPoints\n";
        } else {
          CeedBasis_Hip_shared *basis_data;
          std::string           function_name = is_tensor
                                                    ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
                                                    : "WeightNonTensor";

          code << tab << "CeedScalar r_q" << var_suffix << "[" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
          // Record the device quadrature-weight array for the kernel launch
          CeedCallBackend(CeedBasisGetData(basis, &basis_data));
          data->W = basis_data->d_q_weight_1d;
          code << tab << function_name << "<" << P_name << ", " << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
        }
        break;
      }
      // LCOV_EXCL_START
      case CEED_EVAL_DIV:
      case CEED_EVAL_CURL:
        break;  // TODO: Not implemented
      // LCOV_EXCL_STOP
    }
  } else {
    // Transpose basis action: r_q (quadrature points) -> r_e (element nodes)
    switch (eval_mode) {
      case CEED_EVAL_NONE:
        code << tab << "CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
        break;  // No action
      case CEED_EVAL_INTERP:
        code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
        if (is_at_points) {
          std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";

          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
               << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
        } else {
          std::string function_name =
              is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
                           std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
                        : "InterpTransposeNonTensor";
          std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);

          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
               << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
        }
        break;
      case CEED_EVAL_GRAD:
        code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
        if (is_at_points) {
          // AtPoints: transpose interpolation from Chebyshev coefficients r_c
          std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";

          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
               << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
        } else if (use_3d_slices) {
          // 3D slices: collocated gradient transpose already applied; only interp transpose here
          std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
                                      std::to_string(dim) + "d";

          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
               << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
        } else if (is_tensor) {
          bool        is_collocated_grad = dim == 3 && Q_1d >= P_1d;
          std::string function_name =
              (dim == 1 ? "GradTranspose"
                        : ("GradTransposeTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) +
              std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
          std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);

          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
               << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
        } else {
          std::string function_name = "GradTransposeNonTensor";

          code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
               << ", OP_T_1D>(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
        }
        break;
      // LCOV_EXCL_START
      case CEED_EVAL_WEIGHT:
        break;  // Should not occur
      case CEED_EVAL_DIV:
      case CEED_EVAL_CURL:
        break;  // TODO: Not implemented
      // LCOV_EXCL_STOP
    }
  }
  CeedCallBackend(CeedBasisDestroy(&basis));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// QFunction
//------------------------------------------------------------------------------
static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt max_dim,
                                                    CeedInt max_num_points, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
                                                    CeedQFunctionField *qf_input_fields, CeedInt num_output_fields,
                                                    CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields,
                                                    std::string qfunction_name, CeedInt Q_1d, bool is_all_tensor, bool is_at_points,
                                                    bool use_3d_slices, bool is_assemble) {
  std::string Q_name = is_all_tensor ?
"Q_1d" : "Q"; 734 CeedEvalMode eval_mode = CEED_EVAL_NONE; 735 CeedElemRestriction elem_rstr; 736 737 // Setup output arrays 738 code << "\n"; 739 code << tab << "// -- Output field setup\n"; 740 for (CeedInt i = 0; i < num_output_fields; i++) { 741 const char *field_name; 742 std::string var_suffix = "_out_" + std::to_string(i); 743 744 CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 745 code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 746 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 747 switch (eval_mode) { 748 case CEED_EVAL_NONE: 749 if (is_at_points) { 750 code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n"; 751 } else { 752 code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") 753 << "];\n"; 754 } 755 break; 756 case CEED_EVAL_INTERP: 757 if (is_at_points) { 758 // Accumulator for point data 759 code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n"; 760 code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix 761 << "[i] = 0.0;\n"; 762 } else { 763 code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") 764 << "];\n"; 765 } 766 break; 767 case CEED_EVAL_GRAD: 768 if (is_at_points) { 769 // Accumulator for point data 770 code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n"; 771 code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? 
Q_name : "1") << "; i++) r_c" << var_suffix 772 << "[i] = 0.0;\n"; 773 } else if (use_3d_slices) { 774 // Accumulator for gradient slices 775 code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n"; 776 code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) r_q" << var_suffix << "[i] = 0.0;\n"; 777 } else { 778 code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*" 779 << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") << "];\n"; 780 } 781 break; 782 case CEED_EVAL_WEIGHT: 783 break; 784 // LCOV_EXCL_START 785 case CEED_EVAL_DIV: 786 case CEED_EVAL_CURL: 787 break; // TODO: Not implemented 788 // LCOV_EXCL_STOP 789 } 790 } 791 792 if (is_at_points) { 793 // We need to handle batches of points 794 code << "\n"; 795 code << tab << "// Note: Using batches of points\n"; 796 code << tab << "const CeedInt point_loop_bound = (blockDim.x*blockDim.y) * ceil((1.0*max_num_points) / (blockDim.x*blockDim.y));\n\n"; 797 code << tab << "#pragma unroll\n"; 798 code << tab << "for (CeedInt i = threadIdx.x + threadIdx.y*blockDim.x; i < point_loop_bound; i += blockDim.x*blockDim.y) {\n"; 799 tab.push(); 800 code << tab << "const CeedInt p = i % max_num_points;\n\n"; 801 802 code << tab << "// -- Coordinates\n"; 803 code << tab << "CeedScalar r_x[max_dim];\n"; 804 code << tab << "ReadPoint<max_dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n"; 805 806 code << tab << "// -- Input fields\n"; 807 for (CeedInt i = 0; i < num_input_fields; i++) { 808 const char *field_name; 809 std::string var_suffix = "_in_" + std::to_string(i); 810 std::string P_name = "P_1d" + var_suffix; 811 812 CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name)); 813 code << tab << "// ---- Input field " << i << ": " << field_name << "\n"; 814 
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 815 // Basis action 816 code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 817 switch (eval_mode) { 818 case CEED_EVAL_NONE: 819 code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 820 code << tab << "ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix 821 << ", max_num_points>(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n"; 822 break; 823 case CEED_EVAL_INTERP: 824 code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 825 code << tab << "InterpAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name 826 << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n"; 827 break; 828 case CEED_EVAL_GRAD: 829 code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n"; 830 code << tab << "GradAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name 831 << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n"; 832 break; 833 case CEED_EVAL_WEIGHT: 834 code << tab << "CeedScalar r_s" << var_suffix << "[1];\n"; 835 code << tab << "r_s" << var_suffix << "[0] = 1.0;\n"; 836 break; 837 // LCOV_EXCL_START 838 case CEED_EVAL_DIV: 839 case CEED_EVAL_CURL: 840 break; // TODO: Not implemented 841 // LCOV_EXCL_STOP 842 } 843 } 844 code << "\n"; 845 code << tab << "// -- Output fields\n"; 846 for (CeedInt i = 0; i < num_output_fields; i++) { 847 const char *field_name; 848 std::string var_suffix = "_out_" + std::to_string(i); 849 850 CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 851 code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 852 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 853 // 
Basis action 854 switch (eval_mode) { 855 case CEED_EVAL_NONE: 856 code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 857 break; 858 case CEED_EVAL_INTERP: 859 code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 860 break; 861 case CEED_EVAL_GRAD: 862 code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n"; 863 break; 864 // LCOV_EXCL_START 865 case CEED_EVAL_WEIGHT: 866 break; // Should not occur 867 case CEED_EVAL_DIV: 868 case CEED_EVAL_CURL: 869 break; // TODO: Not implemented 870 // LCOV_EXCL_STOP 871 } 872 } 873 874 } else if (use_3d_slices) { 875 // We treat quadrature points per slice in 3d to save registers 876 code << "\n"; 877 code << tab << "// Note: Using planes of 3D elements\n"; 878 code << tab << "#pragma unroll\n"; 879 code << tab << "for (CeedInt q = 0; q < " << Q_name << "; q++) {\n"; 880 tab.push(); 881 code << tab << "// -- Input fields\n"; 882 for (CeedInt i = 0; i < num_input_fields; i++) { 883 const char *field_name; 884 std::string var_suffix = "_in_" + std::to_string(i); 885 886 CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name)); 887 code << tab << "// ---- Input field " << i << ": " << field_name << "\n"; 888 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 889 // Basis action 890 code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 891 switch (eval_mode) { 892 case CEED_EVAL_NONE: 893 bool is_strided; 894 895 code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 896 897 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 898 CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); 899 if (is_strided) { 900 bool has_backend_strides; 901 CeedInt num_elem, elem_size; 902 903 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 904 
CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); 905 CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); 906 CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 907 908 if (!has_backend_strides) { 909 CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); 910 } 911 code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1] 912 << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n"; 913 code << tab << "ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", strides" << var_suffix << "_0, strides" 914 << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n"; 915 } else { 916 CeedSize l_size = 0; 917 CeedInt comp_stride; 918 CeedElemRestriction_Hip *rstr_data; 919 920 CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 921 code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; 922 CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 923 code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 924 CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); 925 data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; 926 code << tab << "ReadEVecSliceStandard3d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", " << Q_name << ">(data, l_size" 927 << var_suffix << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n"; 928 } 929 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 930 break; 931 case CEED_EVAL_INTERP: 932 code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 933 code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n"; 934 tab.push(); 935 code << tab << "r_s" << var_suffix << "[j] = r_q" << var_suffix 
<< "[q + j*" << Q_name << "];\n"; 936 tab.pop(); 937 code << tab << "}\n"; 938 break; 939 case CEED_EVAL_GRAD: 940 code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n"; 941 code << tab << "GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_q" << var_suffix << ", s_G" 942 << var_suffix << ", r_s" << var_suffix << ");\n"; 943 break; 944 case CEED_EVAL_WEIGHT: 945 code << tab << "CeedScalar r_s" << var_suffix << "[1];\n"; 946 code << tab << "r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n"; 947 break; 948 // LCOV_EXCL_START 949 case CEED_EVAL_DIV: 950 case CEED_EVAL_CURL: 951 break; // TODO: Not implemented 952 // LCOV_EXCL_STOP 953 } 954 } 955 code << "\n"; 956 code << tab << "// -- Output fields\n"; 957 for (CeedInt i = 0; i < num_output_fields; i++) { 958 const char *field_name; 959 std::string var_suffix = "_out_" + std::to_string(i); 960 961 CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 962 code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 963 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 964 // Basis action 965 switch (eval_mode) { 966 case CEED_EVAL_NONE: 967 code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 968 break; 969 case CEED_EVAL_INTERP: 970 code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 971 break; 972 case CEED_EVAL_GRAD: 973 code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n"; 974 break; 975 // LCOV_EXCL_START 976 case CEED_EVAL_WEIGHT: 977 break; // Should not occur 978 case CEED_EVAL_DIV: 979 case CEED_EVAL_CURL: 980 break; // TODO: Not implemented 981 // LCOV_EXCL_STOP 982 } 983 } 984 } else { 985 code << "\n"; 986 code << tab << "// Note: Using full elements\n"; 987 code << tab << "{\n"; 988 tab.push(); 989 code << tab << "// -- 
Input fields\n"; 990 for (CeedInt i = 0; i < num_input_fields; i++) { 991 const char *field_name; 992 993 CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name)); 994 code << tab << "// ---- Input field " << i << ": " << field_name << "\n"; 995 code << tab << "CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n"; 996 } 997 code << tab << "// -- Output fields\n"; 998 for (CeedInt i = 0; i < num_output_fields; i++) { 999 const char *field_name; 1000 1001 CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 1002 code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 1003 code << tab << "CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n"; 1004 } 1005 } 1006 1007 // Input and output buffers 1008 code << "\n"; 1009 code << tab << "// -- QFunction inputs and outputs\n"; 1010 code << tab << "// ---- Inputs\n"; 1011 code << tab << "CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n"; 1012 for (CeedInt i = 0; i < num_input_fields; i++) { 1013 const char *field_name; 1014 1015 CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name)); 1016 code << tab << "// ------ Input field " << i << ": " << field_name << "\n"; 1017 code << tab << "inputs[" << i << "] = r_s_in_" << i << ";\n"; 1018 } 1019 code << tab << "// ---- Outputs\n"; 1020 code << tab << "CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n"; 1021 for (CeedInt i = 0; i < num_output_fields; i++) { 1022 const char *field_name; 1023 1024 CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 1025 code << tab << "// ------ Output field " << i << ": " << field_name << "\n"; 1026 code << tab << "outputs[" << i << "] = r_s_out_" << i << ";\n"; 1027 } 1028 1029 // Apply QFunction 1030 code << "\n"; 1031 code << tab << "// -- Apply QFunction\n"; 1032 code << tab << "" << qfunction_name << "(ctx, "; 1033 if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) { 1034 code << 
"1"; 1035 } else { 1036 code << Q_name; 1037 } 1038 code << ", inputs, outputs);\n"; 1039 1040 if (is_at_points) { 1041 // Map back to coefficients 1042 code << "\n"; 1043 code << tab << "// -- Output fields\n"; 1044 for (CeedInt i = 0; i < num_output_fields; i++) { 1045 const char *field_name; 1046 std::string var_suffix = "_out_" + std::to_string(i); 1047 std::string P_name = "P_1d" + var_suffix; 1048 1049 CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 1050 code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 1051 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 1052 // Basis action 1053 code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 1054 switch (eval_mode) { 1055 case CEED_EVAL_NONE: { 1056 CeedInt comp_stride; 1057 CeedElemRestriction elem_rstr; 1058 1059 if (is_assemble) break; 1060 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 1061 CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 1062 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 1063 code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 1064 code << tab << "WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix 1065 << ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]" 1066 << ", r_s" << var_suffix << ", d" << var_suffix << ");\n"; 1067 break; 1068 } 1069 case CEED_EVAL_INTERP: 1070 code << tab << "if (i >= points.num_per_elem[elem]) {\n"; 1071 tab.push(); 1072 code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n"; 1073 tab.pop(); 1074 code << tab << "}\n"; 1075 code << tab << "InterpTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name 1076 << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n"; 1077 
break; 1078 case CEED_EVAL_GRAD: 1079 code << tab << "if (i >= points.num_per_elem[elem]) {\n"; 1080 tab.push(); 1081 code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n"; 1082 tab.pop(); 1083 code << tab << "}\n"; 1084 code << tab << "GradTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name 1085 << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n"; 1086 break; 1087 // LCOV_EXCL_START 1088 case CEED_EVAL_WEIGHT: 1089 break; // Should not occur 1090 case CEED_EVAL_DIV: 1091 case CEED_EVAL_CURL: 1092 break; // TODO: Not implemented 1093 // LCOV_EXCL_STOP 1094 } 1095 } 1096 } else if (use_3d_slices) { 1097 // Copy or apply transpose grad, if needed 1098 code << "\n"; 1099 code << tab << "// -- Output fields\n"; 1100 for (CeedInt i = 0; i < num_output_fields; i++) { 1101 const char *field_name; 1102 std::string var_suffix = "_out_" + std::to_string(i); 1103 std::string P_name = "P_1d" + var_suffix; 1104 1105 CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 1106 code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 1107 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 1108 // Basis action 1109 code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 1110 switch (eval_mode) { 1111 case CEED_EVAL_NONE: 1112 code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n"; 1113 tab.push(); 1114 code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n"; 1115 tab.pop(); 1116 code << tab << "}\n"; 1117 break; 1118 case CEED_EVAL_INTERP: 1119 code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n"; 1120 tab.push(); 1121 code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n"; 1122 tab.pop(); 1123 code << tab 
<< "}\n"; 1124 break; 1125 case CEED_EVAL_GRAD: 1126 code << tab << "GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_s" << var_suffix << ", s_G" 1127 << var_suffix << ", r_q" << var_suffix << ");\n"; 1128 break; 1129 // LCOV_EXCL_START 1130 case CEED_EVAL_WEIGHT: 1131 break; // Should not occur 1132 case CEED_EVAL_DIV: 1133 case CEED_EVAL_CURL: 1134 break; // TODO: Not implemented 1135 // LCOV_EXCL_STOP 1136 } 1137 } 1138 } 1139 tab.pop(); 1140 code << tab << "}\n"; 1141 return CEED_ERROR_SUCCESS; 1142 } 1143 1144 //------------------------------------------------------------------------------ 1145 // Build single operator kernel 1146 //------------------------------------------------------------------------------ 1147 extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_build) { 1148 bool is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false; 1149 Ceed ceed; 1150 CeedInt Q = 0, Q_1d = 0, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0; 1151 CeedQFunctionField *qf_input_fields, *qf_output_fields; 1152 CeedQFunction_Hip_gen *qf_data; 1153 CeedQFunction qf; 1154 CeedOperatorField *op_input_fields, *op_output_fields; 1155 CeedOperator_Hip_gen *data; 1156 std::ostringstream code; 1157 Tab tab; 1158 1159 CeedCallBackend(CeedOperatorGetData(op, &data)); 1160 { 1161 bool is_setup_done; 1162 1163 CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); 1164 if (is_setup_done) { 1165 *is_good_build = !data->use_fallback; 1166 return CEED_ERROR_SUCCESS; 1167 } 1168 } 1169 1170 // Check field compatibility 1171 CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 1172 { 1173 bool has_shared_bases = true; 1174 1175 for (CeedInt i = 0; i < num_input_fields; i++) { 1176 CeedBasis basis; 1177 1178 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], 
&basis)); 1179 if (basis != CEED_BASIS_NONE) { 1180 bool is_tensor = true; 1181 const char *resource; 1182 char *resource_root; 1183 Ceed basis_ceed; 1184 1185 CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 1186 is_all_tensor = is_all_tensor && is_tensor; 1187 is_all_nontensor = is_all_nontensor && !is_tensor; 1188 CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); 1189 CeedCallBackend(CeedGetResource(basis_ceed, &resource)); 1190 CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); 1191 has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared"); 1192 CeedCallBackend(CeedFree(&resource_root)); 1193 CeedCallBackend(CeedDestroy(&basis_ceed)); 1194 } 1195 CeedCallBackend(CeedBasisDestroy(&basis)); 1196 } 1197 1198 for (CeedInt i = 0; i < num_output_fields; i++) { 1199 CeedBasis basis; 1200 1201 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 1202 if (basis != CEED_BASIS_NONE) { 1203 bool is_tensor = true; 1204 const char *resource; 1205 char *resource_root; 1206 Ceed basis_ceed; 1207 1208 CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 1209 is_all_tensor = is_all_tensor && is_tensor; 1210 is_all_nontensor = is_all_nontensor && !is_tensor; 1211 1212 CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); 1213 CeedCallBackend(CeedGetResource(basis_ceed, &resource)); 1214 CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); 1215 has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared"); 1216 CeedCallBackend(CeedFree(&resource_root)); 1217 CeedCallBackend(CeedDestroy(&basis_ceed)); 1218 } 1219 CeedCallBackend(CeedBasisDestroy(&basis)); 1220 } 1221 // -- Fallback to ref if not all bases are shared 1222 if (!has_shared_bases) { 1223 *is_good_build = false; 1224 return CEED_ERROR_SUCCESS; 1225 } 1226 } 1227 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 1228 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 1229 
CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); 1230 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 1231 1232 // Get operator data 1233 CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points)); 1234 { 1235 CeedInt max_P = 0, max_P_1d = 0; 1236 1237 CeedCallBackend(CeedOperatorBuildKernelData_Hip_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields, 1238 qf_output_fields, &max_P, &max_P_1d, &Q, &Q_1d, &max_dim, &is_all_tensor, &use_3d_slices)); 1239 data->max_P_1d = is_all_tensor ? max_P_1d : max_P; 1240 } 1241 if (is_at_points) { 1242 CeedInt coords_dim = 0; 1243 CeedElemRestriction_Hip *rstr_data; 1244 CeedElemRestriction rstr_points = NULL; 1245 1246 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); 1247 CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points)); 1248 CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride)); 1249 CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_points, &coords_dim)); 1250 CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data)); 1251 data->points.indices = (CeedInt *)rstr_data->d_offsets; 1252 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); 1253 if (max_dim == 0) max_dim = coords_dim; 1254 if (Q_1d == 0) max_num_points = ceil(pow(max_num_points, 1.0 / max_dim)); 1255 } 1256 if (max_dim == 0) max_dim = 1; 1257 data->dim = max_dim; 1258 if (is_at_points) use_3d_slices = false; 1259 if (Q_1d == 0) { 1260 if (is_at_points) Q_1d = max_num_points; 1261 else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d)); 1262 } 1263 if (Q == 0) Q = Q_1d; 1264 data->Q = Q; 1265 data->Q_1d = Q_1d; 1266 1267 // Check for restriction only identity operator 1268 { 1269 bool is_identity_qf; 1270 1271 CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf)); 1272 if (is_identity_qf) { 1273 CeedEvalMode eval_mode_in, eval_mode_out; 
1274 1275 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in)); 1276 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out)); 1277 CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND, 1278 "Backend does not implement restriction only identity operators"); 1279 } 1280 } 1281 1282 // Load basis source files 1283 if (!is_all_nontensor) { 1284 code << tab << "// Tensor basis source\n"; 1285 code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n"; 1286 } 1287 if (!is_all_tensor) { 1288 code << tab << "// Non-tensor basis source\n"; 1289 code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n"; 1290 } 1291 if (is_at_points) { 1292 code << tab << "// AtPoints basis source\n"; 1293 code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n"; 1294 } 1295 if (!is_all_tensor && !is_all_nontensor) { 1296 code << tab << "// Tensor basis source\n"; 1297 code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h>\n\n"; 1298 } 1299 code << tab << "// CodeGen operator source\n"; 1300 code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n"; 1301 1302 // Get QFunction name 1303 std::string qfunction_name(qf_data->qfunction_name); 1304 std::string operator_name; 1305 1306 operator_name = "CeedKernelHipGenOperator_" + qfunction_name; 1307 1308 // Define CEED_Q_VLA 1309 code << "\n" << tab << "#undef CEED_Q_VLA\n"; 1310 if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) { 1311 code << tab << "#define CEED_Q_VLA 1\n\n"; 1312 } else { 1313 code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n"; 1314 } 1315 1316 // Add user QFunction source 1317 { 1318 const char *source_path; 1319 1320 CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path)); 1321 CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, 
"/gpu/hip/gen backend requires QFunction source code file"); 1322 1323 code << tab << "// User QFunction source\n"; 1324 code << tab << "#include \"" << source_path << "\"\n\n"; 1325 } 1326 1327 // Setup 1328 code << "\n" << tab << "// -----------------------------------------------------------------------------\n"; 1329 code << tab << "// Operator Kernel\n"; 1330 code << tab << "// \n"; 1331 code << tab << "// d_[in,out]_i: CeedVector device array\n"; 1332 code << tab << "// r_[in,out]_e_i: Element vector register\n"; 1333 code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n"; 1334 code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n"; 1335 code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n"; 1336 code << tab << "// \n"; 1337 code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n"; 1338 code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; 1339 code << tab << "// -----------------------------------------------------------------------------\n"; 1340 code << tab << "extern \"C\" __launch_bounds__(BLOCK_SIZE)\n"; 1341 code << "__global__ void " << operator_name 1342 << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar* W, Points_Hip points) {\n"; 1343 tab.push(); 1344 1345 // Scratch buffers 1346 for (CeedInt i = 0; i < num_input_fields; i++) { 1347 CeedEvalMode eval_mode; 1348 1349 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 1350 if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 1351 code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; 1352 } 1353 } 1354 for (CeedInt i = 0; i < num_output_fields; i++) { 1355 code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; 1356 } 1357 1358 code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; 1359 if (!is_all_tensor) { 1360 code 
<< tab << "const CeedInt Q = " << Q << ";\n"; 1361 } 1362 if (!is_all_nontensor) { 1363 code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n"; 1364 } 1365 if (is_at_points) { 1366 code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n"; 1367 code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n"; 1368 } 1369 1370 // Shared data 1371 code << tab << "extern __shared__ CeedScalar slice[];\n"; 1372 code << tab << "SharedData_Hip data;\n"; 1373 code << tab << "data.t_id_x = threadIdx.x;\n"; 1374 code << tab << "data.t_id_y = threadIdx.y;\n"; 1375 code << tab << "data.t_id_z = threadIdx.z;\n"; 1376 code << tab << "data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 1377 code << tab << "data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n"; 1378 1379 // -- Determine input mat reuse 1380 FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX]; 1381 1382 for (CeedInt i = 0; i < num_input_fields; i++) { 1383 input_matrix_reuse[i].index = -1; 1384 } 1385 for (CeedInt i = 0; i < num_input_fields; i++) { 1386 bool is_tensor = true; 1387 CeedEvalMode eval_mode_i; 1388 CeedBasis basis_i; 1389 1390 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i)); 1391 if (eval_mode_i == CEED_EVAL_WEIGHT) continue; 1392 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i)); 1393 CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor)); 1394 for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) { 1395 CeedEvalMode eval_mode_j; 1396 CeedBasis basis_j; 1397 1398 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); 1399 if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 1400 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); 1401 if (basis_i == basis_j) { 1402 if (is_tensor) { 1403 input_matrix_reuse[i].index = j; 1404 input_matrix_reuse[i].is_input = 
true; 1405 input_matrix_reuse[i].eval_mode = eval_mode_j; 1406 } else { 1407 // For non-tensor can only re-use with the same eval mode 1408 if (eval_mode_i == eval_mode_j) { 1409 input_matrix_reuse[i].index = j; 1410 input_matrix_reuse[i].is_input = true; 1411 input_matrix_reuse[i].eval_mode = eval_mode_j; 1412 } 1413 } 1414 } 1415 CeedCallBackend(CeedBasisDestroy(&basis_j)); 1416 } 1417 CeedCallBackend(CeedBasisDestroy(&basis_i)); 1418 } 1419 1420 // -- Determine output mat reuse 1421 FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX]; 1422 1423 for (CeedInt i = 0; i < num_output_fields; i++) { 1424 output_matrix_reuse[i].index = -1; 1425 } 1426 for (CeedInt i = 0; i < num_output_fields; i++) { 1427 bool is_tensor = true; 1428 CeedEvalMode eval_mode_i; 1429 CeedBasis basis_i; 1430 1431 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i)); 1432 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i)); 1433 for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) { 1434 CeedEvalMode eval_mode_j; 1435 CeedBasis basis_j; 1436 1437 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); 1438 if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 1439 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); 1440 if (basis_i == basis_j) { 1441 if (is_tensor) { 1442 output_matrix_reuse[i].index = j; 1443 output_matrix_reuse[i].is_input = true; 1444 output_matrix_reuse[i].eval_mode = eval_mode_j; 1445 } else { 1446 // For non-tensor can only re-use with the same eval mode 1447 if (eval_mode_i == eval_mode_j) { 1448 output_matrix_reuse[i].index = j; 1449 output_matrix_reuse[i].is_input = true; 1450 output_matrix_reuse[i].eval_mode = eval_mode_j; 1451 } 1452 } 1453 } 1454 CeedCallBackend(CeedBasisDestroy(&basis_j)); 1455 } 1456 for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) { 1457 CeedEvalMode eval_mode_j; 1458 CeedBasis basis_j; 1459 
1460 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j)); 1461 if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 1462 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j)); 1463 CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor)); 1464 if (basis_i == basis_j) { 1465 if (is_tensor) { 1466 output_matrix_reuse[i].index = j; 1467 output_matrix_reuse[i].is_input = false; 1468 output_matrix_reuse[i].eval_mode = eval_mode_j; 1469 } else { 1470 // For non-tensor can only re-use with the same eval mode 1471 if (eval_mode_i == eval_mode_j) { 1472 output_matrix_reuse[i].index = j; 1473 output_matrix_reuse[i].is_input = false; 1474 output_matrix_reuse[i].eval_mode = eval_mode_j; 1475 } 1476 } 1477 } 1478 CeedCallBackend(CeedBasisDestroy(&basis_j)); 1479 } 1480 CeedCallBackend(CeedBasisDestroy(&basis_i)); 1481 } 1482 1483 // Initialize constants, and matrices B and G 1484 code << "\n" << tab << "// Input field constants and basis data\n"; 1485 for (CeedInt i = 0; i < num_input_fields; i++) { 1486 CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], 1487 max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false)); 1488 } 1489 code << "\n" << tab << "// Output field constants and basis data\n"; 1490 for (CeedInt i = 0; i < num_output_fields; i++) { 1491 CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], 1492 max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false)); 1493 } 1494 1495 // Loop over all elements 1496 code << "\n" << tab << "// Element loop\n"; 1497 code << tab << "__syncthreads();\n"; 1498 code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n"; 1499 tab.push(); 1500 1501 // -- Compute minimum buffer space needed 1502 CeedInt 
max_rstr_buffer_size = 1; 1503 1504 for (CeedInt i = 0; i < num_input_fields; i++) { 1505 CeedEvalMode eval_mode; 1506 1507 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 1508 if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) { 1509 CeedInt num_comp; 1510 CeedElemRestriction elem_rstr; 1511 1512 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 1513 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 1514 max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); 1515 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 1516 } 1517 } 1518 for (CeedInt i = 0; i < num_output_fields; i++) { 1519 CeedEvalMode eval_mode; 1520 1521 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 1522 if (eval_mode != CEED_EVAL_NONE) { 1523 CeedInt num_comp; 1524 CeedElemRestriction elem_rstr; 1525 1526 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 1527 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 1528 max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? 
Q_1d : 1)); 1529 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 1530 } 1531 } 1532 code << tab << "// Scratch restriction buffer space\n"; 1533 code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n"; 1534 1535 // -- Determine best input field processing order 1536 CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX]; 1537 1538 for (CeedInt i = 0; i < num_input_fields; i++) { 1539 field_rstr_in_buffer[i] = -1; 1540 input_field_order[i] = -1; 1541 } 1542 { 1543 bool is_ordered[CEED_FIELD_MAX]; 1544 CeedInt curr_index = 0; 1545 1546 for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false; 1547 for (CeedInt i = 0; i < num_input_fields; i++) { 1548 CeedVector vec_i; 1549 CeedElemRestriction rstr_i; 1550 1551 if (is_ordered[i]) continue; 1552 field_rstr_in_buffer[i] = i; 1553 is_ordered[i] = true; 1554 input_field_order[curr_index] = i; 1555 curr_index++; 1556 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i)); 1557 if (vec_i == CEED_VECTOR_NONE) continue; // CEED_EVAL_WEIGHT 1558 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i)); 1559 for (CeedInt j = i + 1; j < num_input_fields; j++) { 1560 CeedVector vec_j; 1561 CeedElemRestriction rstr_j; 1562 1563 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j)); 1564 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j)); 1565 if (rstr_i == rstr_j && vec_i == vec_j) { 1566 field_rstr_in_buffer[j] = i; 1567 is_ordered[j] = true; 1568 input_field_order[curr_index] = j; 1569 curr_index++; 1570 } 1571 CeedCallBackend(CeedVectorDestroy(&vec_j)); 1572 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); 1573 } 1574 CeedCallBackend(CeedVectorDestroy(&vec_i)); 1575 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); 1576 } 1577 } 1578 1579 // -- Input restriction and basis 1580 code << "\n" << tab << "// -- Input field restrictions and basis actions\n"; 1581 for 
(CeedInt i = 0; i < num_input_fields; i++) { 1582 const char *field_name; 1583 const CeedInt f = input_field_order[i]; 1584 1585 CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name)); 1586 code << tab << "// ---- Input field " << f << ": " << field_name << "\n"; 1587 1588 // ---- Restriction 1589 CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], 1590 max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices)); 1591 1592 // ---- Basis action 1593 CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true, 1594 is_all_tensor, is_at_points, use_3d_slices)); 1595 } 1596 1597 // -- Q function 1598 CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields, 1599 qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name, 1600 Q_1d, is_all_tensor, is_at_points, use_3d_slices, false)); 1601 1602 // -- Output basis and restriction 1603 code << "\n" << tab << "// -- Output field basis action and restrictions\n"; 1604 for (CeedInt i = 0; i < num_output_fields; i++) { 1605 const char *field_name; 1606 1607 CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 1608 code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 1609 1610 // ---- Basis action 1611 CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false, 1612 is_all_tensor, is_at_points, use_3d_slices)); 1613 1614 // ---- Restriction 1615 CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, i, NULL, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, 1616 false, is_all_tensor, is_at_points, use_3d_slices)); 1617 } 1618 1619 // Close loop and function 1620 tab.pop(); 1621 code << tab << 
"}\n"; 1622 tab.pop(); 1623 code << tab << "}\n"; 1624 code << tab << "// -----------------------------------------------------------------------------\n\n"; 1625 1626 CeedInt block_sizes[3] = {0, 0, 0}; 1627 CeedInt num_elem; 1628 1629 // Compile 1630 CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); 1631 CeedCallBackend(BlockGridCalculate_Hip_gen(is_all_tensor ? max_dim : 1, num_elem, data->max_P_1d, is_all_tensor ? Q_1d : Q, block_sizes)); 1632 { 1633 bool is_compile_good = false; 1634 1635 data->thread_1d = block_sizes[0]; 1636 CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "OP_T_1D", block_sizes[0], "BLOCK_SIZE", 1637 block_sizes[0] * block_sizes[1] * block_sizes[2])); 1638 if (is_compile_good) { 1639 *is_good_build = true; 1640 CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, operator_name.c_str(), &data->op)); 1641 } else { 1642 *is_good_build = false; 1643 data->use_fallback = true; 1644 } 1645 } 1646 CeedCallBackend(CeedOperatorSetSetupDone(op)); 1647 CeedCallBackend(CeedDestroy(&ceed)); 1648 CeedCallBackend(CeedQFunctionDestroy(&qf)); 1649 return CEED_ERROR_SUCCESS; 1650 } 1651 1652 //------------------------------------------------------------------------------ 1653 // Build AtPoints assembly operator kernel 1654 //------------------------------------------------------------------------------ 1655 static int CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op, bool is_full, bool *is_good_build) { 1656 bool is_all_tensor = true, is_at_points = false, use_3d_slices = false; 1657 Ceed ceed; 1658 CeedInt Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0; 1659 CeedQFunctionField *qf_input_fields, *qf_output_fields; 1660 CeedQFunction_Hip_gen *qf_data; 1661 CeedQFunction qf; 1662 CeedOperatorField *op_input_fields, *op_output_fields; 1663 CeedOperator_Hip_gen *data; 1664 std::ostringstream code; 1665 Tab tab; 1666 1667 // Check 
compatibility 1668 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 1669 CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points)); 1670 CeedCheck(is_at_points, ceed, CEED_ERROR_BACKEND, "Only AtPoints operator assembly supported"); 1671 1672 // Retrieve operator data 1673 CeedCallBackend(CeedOperatorGetData(op, &data)); 1674 Q = data->Q; 1675 Q_1d = data->Q_1d; 1676 max_dim = data->dim; 1677 { 1678 CeedElemRestriction rstr_points = NULL; 1679 1680 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); 1681 CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points)); 1682 CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride)); 1683 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); 1684 } 1685 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 1686 CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); 1687 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 1688 CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 1689 1690 // Load basis source files 1691 code << tab << "// Tensor basis source\n"; 1692 code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n"; 1693 code << tab << "// AtPoints basis source\n"; 1694 code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n"; 1695 code << tab << "// CodeGen operator source\n"; 1696 code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n"; 1697 1698 // Get QFunction name 1699 std::string qfunction_name(qf_data->qfunction_name); 1700 std::string operator_name; 1701 1702 if (is_full) { 1703 operator_name = "CeedKernelHipGenOperatorFullAssembly_" + qfunction_name; 1704 } else { 1705 operator_name = "CeedKernelHipGenOperatorDiagonalAssembly_" + qfunction_name; 1706 } 1707 1708 // Define CEED_Q_VLA 1709 code << "\n" << tab << "#undef CEED_Q_VLA\n"; 1710 
code << tab << "#define CEED_Q_VLA 1\n\n"; 1711 1712 // Add user QFunction source 1713 { 1714 const char *source_path; 1715 1716 CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path)); 1717 CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file"); 1718 1719 code << tab << "// User QFunction source\n"; 1720 code << tab << "#include \"" << source_path << "\"\n\n"; 1721 } 1722 1723 // Setup 1724 code << "\n" << tab << "// -----------------------------------------------------------------------------\n"; 1725 code << tab << "// Operator Assembly Kernel\n"; 1726 code << tab << "// \n"; 1727 code << tab << "// d_[in,out]_i: CeedVector device array\n"; 1728 code << tab << "// r_[in,out]_e_i: Element vector register\n"; 1729 code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n"; 1730 code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n"; 1731 code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n"; 1732 code << tab << "// \n"; 1733 code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n"; 1734 code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; 1735 code << tab << "// -----------------------------------------------------------------------------\n"; 1736 code << tab << "extern \"C\" __global__ void " << operator_name 1737 << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar *W, Points_Hip " 1738 "points, CeedScalar *__restrict__ values_array) {\n"; 1739 tab.push(); 1740 1741 // Scratch buffers 1742 for (CeedInt i = 0; i < num_input_fields; i++) { 1743 CeedEvalMode eval_mode; 1744 1745 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 1746 if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 1747 code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; 1748 } 1749 } 1750 for 
(CeedInt i = 0; i < num_output_fields; i++) { 1751 code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; 1752 } 1753 1754 code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; 1755 code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n"; 1756 code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n"; 1757 code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n"; 1758 1759 // Shared data 1760 code << tab << "extern __shared__ CeedScalar slice[];\n"; 1761 code << tab << "SharedData_Hip data;\n"; 1762 code << tab << "data.t_id_x = threadIdx.x;\n"; 1763 code << tab << "data.t_id_y = threadIdx.y;\n"; 1764 code << tab << "data.t_id_z = threadIdx.z;\n"; 1765 code << tab << "data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 1766 code << tab << "data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n"; 1767 1768 // -- Determine input mat reuse 1769 FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX]; 1770 1771 for (CeedInt i = 0; i < num_input_fields; i++) { 1772 input_matrix_reuse[i].index = -1; 1773 } 1774 for (CeedInt i = 0; i < num_input_fields; i++) { 1775 CeedEvalMode eval_mode_i; 1776 CeedBasis basis_i; 1777 1778 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i)); 1779 if (eval_mode_i == CEED_EVAL_WEIGHT) continue; 1780 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i)); 1781 for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) { 1782 CeedEvalMode eval_mode_j; 1783 CeedBasis basis_j; 1784 1785 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); 1786 if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 1787 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); 1788 if (basis_i == basis_j) { 1789 input_matrix_reuse[i].index = j; 1790 input_matrix_reuse[i].is_input = true; 1791 
input_matrix_reuse[i].eval_mode = eval_mode_j; 1792 } 1793 CeedCallBackend(CeedBasisDestroy(&basis_j)); 1794 } 1795 CeedCallBackend(CeedBasisDestroy(&basis_i)); 1796 } 1797 1798 // -- Determine output mat reuse 1799 FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX]; 1800 1801 for (CeedInt i = 0; i < num_output_fields; i++) { 1802 output_matrix_reuse[i].index = -1; 1803 } 1804 for (CeedInt i = 0; i < num_output_fields; i++) { 1805 CeedEvalMode eval_mode_i; 1806 CeedBasis basis_i; 1807 1808 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i)); 1809 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i)); 1810 for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) { 1811 CeedEvalMode eval_mode_j; 1812 CeedBasis basis_j; 1813 1814 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); 1815 if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 1816 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); 1817 if (basis_i == basis_j) { 1818 output_matrix_reuse[i].index = j; 1819 output_matrix_reuse[i].is_input = true; 1820 output_matrix_reuse[i].eval_mode = eval_mode_j; 1821 } 1822 CeedCallBackend(CeedBasisDestroy(&basis_j)); 1823 } 1824 for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) { 1825 CeedEvalMode eval_mode_j; 1826 CeedBasis basis_j; 1827 1828 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j)); 1829 if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 1830 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j)); 1831 if (basis_i == basis_j) { 1832 output_matrix_reuse[i].index = j; 1833 output_matrix_reuse[i].is_input = false; 1834 output_matrix_reuse[i].eval_mode = eval_mode_j; 1835 } 1836 CeedCallBackend(CeedBasisDestroy(&basis_j)); 1837 } 1838 CeedCallBackend(CeedBasisDestroy(&basis_i)); 1839 } 1840 1841 // Initialize constants, and matrices B and G 1842 code << "\n" << tab << 
"// Input field constants and basis data\n"; 1843 for (CeedInt i = 0; i < num_input_fields; i++) { 1844 CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], 1845 max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false)); 1846 } 1847 code << "\n" << tab << "// Output field constants and basis data\n"; 1848 for (CeedInt i = 0; i < num_output_fields; i++) { 1849 CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], 1850 max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false)); 1851 } 1852 1853 // Loop over all elements 1854 code << "\n" << tab << "// Element loop\n"; 1855 code << tab << "__syncthreads();\n"; 1856 code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n"; 1857 tab.push(); 1858 1859 // -- Compute minimum buffer space needed 1860 CeedInt max_rstr_buffer_size = 1; 1861 1862 for (CeedInt i = 0; i < num_input_fields; i++) { 1863 CeedEvalMode eval_mode; 1864 1865 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 1866 if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) { 1867 CeedInt num_comp; 1868 CeedElemRestriction elem_rstr; 1869 1870 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 1871 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 1872 max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? 
Q_1d : 1)); 1873 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 1874 } 1875 } 1876 for (CeedInt i = 0; i < num_output_fields; i++) { 1877 CeedEvalMode eval_mode; 1878 1879 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 1880 if (eval_mode != CEED_EVAL_NONE) { 1881 CeedInt num_comp; 1882 CeedElemRestriction elem_rstr; 1883 1884 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 1885 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 1886 max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); 1887 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 1888 } 1889 } 1890 code << tab << "// Scratch restriction buffer space\n"; 1891 code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n"; 1892 1893 // -- Determine best input field processing order 1894 CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX]; 1895 1896 for (CeedInt i = 0; i < num_input_fields; i++) { 1897 field_rstr_in_buffer[i] = -1; 1898 input_field_order[i] = -1; 1899 } 1900 { 1901 bool is_ordered[CEED_FIELD_MAX]; 1902 CeedInt curr_index = 0; 1903 1904 for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false; 1905 for (CeedInt i = 0; i < num_input_fields; i++) { 1906 CeedVector vec_i; 1907 CeedElemRestriction rstr_i; 1908 1909 if (is_ordered[i]) continue; 1910 field_rstr_in_buffer[i] = i; 1911 is_ordered[i] = true; 1912 input_field_order[curr_index] = i; 1913 curr_index++; 1914 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i)); 1915 if (vec_i == CEED_VECTOR_NONE) continue; // CEED_EVAL_WEIGHT 1916 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i)); 1917 for (CeedInt j = i + 1; j < num_input_fields; j++) { 1918 CeedVector vec_j; 1919 CeedElemRestriction rstr_j; 1920 1921 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], 
&vec_j)); 1922 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j)); 1923 if (rstr_i == rstr_j && vec_i == vec_j) { 1924 field_rstr_in_buffer[j] = i; 1925 is_ordered[j] = true; 1926 input_field_order[curr_index] = j; 1927 curr_index++; 1928 } 1929 CeedCallBackend(CeedVectorDestroy(&vec_j)); 1930 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); 1931 } 1932 CeedCallBackend(CeedVectorDestroy(&vec_i)); 1933 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); 1934 } 1935 } 1936 1937 // -- Input restriction and basis 1938 code << "\n" << tab << "// -- Input field restrictions and basis actions\n"; 1939 CeedInt active_field_index = -1; 1940 1941 for (CeedInt i = 0; i < num_input_fields; i++) { 1942 bool is_active = false; 1943 const char *field_name; 1944 const CeedInt f = input_field_order[i]; 1945 1946 { 1947 CeedVector vec; 1948 1949 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec)); 1950 is_active = vec == CEED_VECTOR_ACTIVE; 1951 CeedCallBackend(CeedVectorDestroy(&vec)); 1952 } 1953 1954 CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name)); 1955 code << tab << "// ---- Input field " << f << ": " << field_name << "\n"; 1956 1957 if (is_active) { 1958 std::string var_suffix = "_in_" + std::to_string(f); 1959 1960 code << tab << "// Active field - no restriction or basis action here\n"; 1961 if (active_field_index == -1) { 1962 active_field_index = f; 1963 code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? 
"P_1d" + var_suffix : "1") 1964 << "] = {0.0};\n"; 1965 } else { 1966 code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_in_" << active_field_index << ";\n"; 1967 } 1968 } else { 1969 // ---- Restriction 1970 CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], 1971 max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices)); 1972 1973 // ---- Basis action 1974 CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true, 1975 is_all_tensor, is_at_points, use_3d_slices)); 1976 } 1977 } 1978 1979 // -- Loop over active field 1980 std::string active_var_suffix = "_in_" + std::to_string(active_field_index); 1981 1982 code << "\n" << tab << "// Loop over nodes in active field\n"; 1983 code << tab << "for (CeedInt n = 0; n < num_comp" << active_var_suffix << "*P_1d" << active_var_suffix 1984 << (max_dim > 1 ? "*P_1d" + active_var_suffix : "") << (max_dim > 2 ? 
"*P_1d" + active_var_suffix : "") << "; n++) {\n"; 1985 tab.push(); 1986 1987 // -- Set current active node and component to 1 1988 code << tab << "// Set current active node and component to 1.0\n"; 1989 code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 1.0, r_e" 1990 << active_var_suffix << ");\n\n"; 1991 1992 for (CeedInt i = 0; i < num_input_fields; i++) { 1993 bool is_active = false; 1994 const char *field_name; 1995 const CeedInt f = input_field_order[i]; 1996 1997 { 1998 CeedVector vec; 1999 2000 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec)); 2001 is_active = vec == CEED_VECTOR_ACTIVE; 2002 CeedCallBackend(CeedVectorDestroy(&vec)); 2003 } 2004 if (!is_active) continue; 2005 2006 CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name)); 2007 code << tab << "// ---- Input field " << f << ": " << field_name << "\n"; 2008 2009 // ---- Basis action 2010 CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true, 2011 is_all_tensor, is_at_points, use_3d_slices)); 2012 } 2013 2014 // -- Q function 2015 CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields, 2016 qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name, 2017 Q_1d, is_all_tensor, is_at_points, use_3d_slices, true)); 2018 2019 // -- Output basis and restriction 2020 code << "\n" << tab << "// -- Output field basis action and restrictions\n"; 2021 for (CeedInt i = 0; i < num_output_fields; i++) { 2022 bool is_active = false; 2023 const char *field_name; 2024 2025 { 2026 CeedVector vec; 2027 2028 CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); 2029 is_active = vec == CEED_VECTOR_ACTIVE; 2030 CeedCallBackend(CeedVectorDestroy(&vec)); 2031 } 2032 if (!is_active) continue; 2033 
2034 CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 2035 code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 2036 2037 // ---- Basis action 2038 CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false, 2039 is_all_tensor, is_at_points, use_3d_slices)); 2040 2041 // ---- Restriction 2042 if (is_full) { 2043 std::string var_suffix = "_out_" + std::to_string(i); 2044 CeedInt comp_stride; 2045 CeedSize l_size; 2046 CeedElemRestriction elem_rstr; 2047 2048 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 2049 CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 2050 code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; 2051 CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 2052 code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 2053 code << tab << "WriteLVecStandard" << max_dim << "d_Assembly<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix 2054 << ">(data, l_size" << var_suffix << ", elem, n, r_e" << var_suffix << ", values_array);\n"; 2055 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 2056 } else { 2057 std::string var_suffix = "_out_" + std::to_string(i); 2058 CeedInt comp_stride; 2059 CeedSize l_size; 2060 CeedElemRestriction elem_rstr; 2061 2062 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 2063 CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 2064 code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; 2065 CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 2066 code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 2067 code << tab << "WriteLVecStandard" << max_dim << "d_Single<num_comp" << 
var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix 2068 << ">(data, l_size" << var_suffix << ", elem, n, indices.outputs[" << i << "], r_e" << var_suffix << ", values_array);\n"; 2069 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 2070 } 2071 } 2072 2073 // -- Reset current active node and component 2074 code << "\n" << tab << "// Reset current active node and component to 0.0\n"; 2075 code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 0.0, r_e" 2076 << active_var_suffix << ");\n"; 2077 2078 // -- End of loop over active field 2079 tab.pop(); 2080 code << tab << "}\n"; 2081 2082 // Close loop and function 2083 tab.pop(); 2084 code << tab << "}\n"; 2085 tab.pop(); 2086 code << tab << "}\n"; 2087 code << tab << "// -----------------------------------------------------------------------------\n\n"; 2088 2089 CeedInt block_sizes[3] = {0, 0, 0}; 2090 CeedInt num_elem; 2091 2092 // Compile 2093 CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); 2094 CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes)); 2095 { 2096 bool is_compile_good = false; 2097 2098 data->thread_1d = block_sizes[0]; 2099 CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, 2100 is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 2, "OP_T_1D", block_sizes[0], 2101 "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2])); 2102 if (is_compile_good) { 2103 *is_good_build = true; 2104 CeedCallBackend(CeedGetKernel_Hip(ceed, is_full ? data->module_assemble_full : data->module_assemble_diagonal, operator_name.c_str(), 2105 is_full ? 
&data->assemble_full : &data->assemble_diagonal)); 2106 } else { 2107 *is_good_build = false; 2108 data->use_assembly_fallback = true; 2109 } 2110 } 2111 CeedCallBackend(CeedDestroy(&ceed)); 2112 CeedCallBackend(CeedQFunctionDestroy(&qf)); 2113 return CEED_ERROR_SUCCESS; 2114 } 2115 2116 extern "C" int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) { 2117 return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, false, is_good_build); 2118 } 2119 2120 extern "C" int CeedOperatorBuildKernelFullAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) { 2121 return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, true, is_good_build); 2122 } 2123 //------------------------------------------------------------------------------ 2124 // Build QFunction assembly operator kernel 2125 //------------------------------------------------------------------------------ 2126 extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperator op, bool *is_good_build) { 2127 bool is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false; 2128 Ceed ceed; 2129 CeedInt Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0; 2130 CeedQFunctionField *qf_input_fields, *qf_output_fields; 2131 CeedQFunction_Hip_gen *qf_data; 2132 CeedQFunction qf; 2133 CeedOperatorField *op_input_fields, *op_output_fields; 2134 CeedOperator_Hip_gen *data; 2135 std::ostringstream code; 2136 Tab tab; 2137 2138 // Check compatibility 2139 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 2140 CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points)); 2141 CeedCheck(!is_at_points, ceed, CEED_ERROR_BACKEND, "AtPoints QFunction assembly is not supported"); 2142 2143 // Check field compatibility 2144 CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 2145 { 2146 bool has_shared_bases = true; 2147 2148 for (CeedInt i = 
0; i < num_input_fields; i++) { 2149 CeedBasis basis; 2150 2151 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 2152 if (basis != CEED_BASIS_NONE) { 2153 bool is_tensor = true; 2154 const char *resource; 2155 char *resource_root; 2156 Ceed basis_ceed; 2157 2158 CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 2159 is_all_tensor = is_all_tensor && is_tensor; 2160 is_all_nontensor = is_all_nontensor && !is_tensor; 2161 CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); 2162 CeedCallBackend(CeedGetResource(basis_ceed, &resource)); 2163 CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); 2164 has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared"); 2165 CeedCallBackend(CeedFree(&resource_root)); 2166 CeedCallBackend(CeedDestroy(&basis_ceed)); 2167 } 2168 CeedCallBackend(CeedBasisDestroy(&basis)); 2169 } 2170 2171 for (CeedInt i = 0; i < num_output_fields; i++) { 2172 CeedBasis basis; 2173 2174 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 2175 if (basis != CEED_BASIS_NONE) { 2176 bool is_tensor = true; 2177 const char *resource; 2178 char *resource_root; 2179 Ceed basis_ceed; 2180 2181 CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 2182 is_all_tensor = is_all_tensor && is_tensor; 2183 is_all_nontensor = is_all_nontensor && !is_tensor; 2184 2185 CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); 2186 CeedCallBackend(CeedGetResource(basis_ceed, &resource)); 2187 CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); 2188 has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared"); 2189 CeedCallBackend(CeedFree(&resource_root)); 2190 CeedCallBackend(CeedDestroy(&basis_ceed)); 2191 } 2192 CeedCallBackend(CeedBasisDestroy(&basis)); 2193 } 2194 } 2195 2196 // Retrieve operator data 2197 CeedCallBackend(CeedOperatorGetData(op, &data)); 2198 Q = data->Q; 2199 Q_1d = data->Q_1d; 2200 max_dim = data->dim; 
2201 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 2202 CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); 2203 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 2204 2205 // Load basis source files 2206 if (!is_all_nontensor) { 2207 code << tab << "// Tensor basis source\n"; 2208 code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n"; 2209 } 2210 if (!is_all_tensor) { 2211 code << tab << "// Non-tensor basis source\n"; 2212 code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n"; 2213 } 2214 if (!is_all_tensor && !is_all_nontensor) { 2215 code << "// Tensor basis source\n"; 2216 code << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h>\n\n"; 2217 } 2218 code << "// CodeGen operator source\n"; 2219 code << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n"; 2220 2221 // Get QFunction name 2222 std::string qfunction_name(qf_data->qfunction_name); 2223 std::string operator_name; 2224 2225 operator_name = "CeedKernelHipGenQFunctionAssembly_" + qfunction_name; 2226 2227 // Define CEED_Q_VLA 2228 code << "\n" << tab << "#undef CEED_Q_VLA\n"; 2229 if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) { 2230 code << tab << "#define CEED_Q_VLA 1\n\n"; 2231 } else { 2232 code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n"; 2233 } 2234 2235 // Add user QFunction source 2236 { 2237 const char *source_path; 2238 2239 CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path)); 2240 CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file"); 2241 2242 code << tab << "// User QFunction source\n"; 2243 code << tab << "#include \"" << source_path << "\"\n\n"; 2244 } 2245 2246 // Setup 2247 code << "\n" << tab << "// -----------------------------------------------------------------------------\n"; 2248 code << tab << "// Operator Assembly Kernel\n"; 2249 
code << tab << "// \n"; 2250 code << tab << "// d_[in,out]_i: CeedVector device array\n"; 2251 code << tab << "// r_[in,out]_e_i: Element vector register\n"; 2252 code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n"; 2253 code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n"; 2254 code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n"; 2255 code << tab << "// \n"; 2256 code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n"; 2257 code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; 2258 code << tab << "// -----------------------------------------------------------------------------\n"; 2259 code << tab << "extern \"C\" __global__ void " << operator_name 2260 << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar *W, Points_Hip " 2261 "points, CeedScalar *__restrict__ values_array) {\n"; 2262 tab.push(); 2263 2264 // Scratch buffers 2265 for (CeedInt i = 0; i < num_input_fields; i++) { 2266 CeedEvalMode eval_mode; 2267 2268 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 2269 if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 2270 code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; 2271 } 2272 } 2273 for (CeedInt i = 0; i < num_output_fields; i++) { 2274 bool is_active = false; 2275 2276 { 2277 CeedVector vec; 2278 2279 CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); 2280 is_active = vec == CEED_VECTOR_ACTIVE; 2281 CeedCallBackend(CeedVectorDestroy(&vec)); 2282 } 2283 if (is_active) { 2284 code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; 2285 } 2286 } 2287 2288 code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; 2289 if (!is_all_tensor) { 2290 code << tab << "const CeedInt Q = " << Q << ";\n"; 2291 } 2292 if (!is_all_nontensor) { 2293 code << 
tab << "const CeedInt Q_1d = " << Q_1d << ";\n"; 2294 } 2295 2296 // Shared data 2297 code << tab << "extern __shared__ CeedScalar slice[];\n"; 2298 code << tab << "SharedData_Hip data;\n"; 2299 code << tab << "data.t_id_x = threadIdx.x;\n"; 2300 code << tab << "data.t_id_y = threadIdx.y;\n"; 2301 code << tab << "data.t_id_z = threadIdx.z;\n"; 2302 code << tab << "data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 2303 code << tab << "data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n"; 2304 2305 // -- Determine input mat reuse 2306 FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX]; 2307 2308 for (CeedInt i = 0; i < num_input_fields; i++) { 2309 input_matrix_reuse[i].index = -1; 2310 } 2311 for (CeedInt i = 0; i < num_input_fields; i++) { 2312 bool is_tensor = true; 2313 CeedEvalMode eval_mode_i; 2314 CeedBasis basis_i; 2315 2316 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i)); 2317 if (eval_mode_i == CEED_EVAL_WEIGHT) continue; 2318 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i)); 2319 CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor)); 2320 for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) { 2321 CeedEvalMode eval_mode_j; 2322 CeedBasis basis_j; 2323 2324 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); 2325 if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 2326 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); 2327 if (basis_i == basis_j) { 2328 if (is_tensor) { 2329 input_matrix_reuse[i].index = j; 2330 input_matrix_reuse[i].is_input = true; 2331 input_matrix_reuse[i].eval_mode = eval_mode_j; 2332 } else { 2333 // For non-tensor can only re-use with the same eval mode 2334 if (eval_mode_i == eval_mode_j) { 2335 input_matrix_reuse[i].index = j; 2336 input_matrix_reuse[i].is_input = true; 2337 input_matrix_reuse[i].eval_mode = 
eval_mode_j; 2338 } 2339 } 2340 } 2341 CeedCallBackend(CeedBasisDestroy(&basis_j)); 2342 } 2343 CeedCallBackend(CeedBasisDestroy(&basis_i)); 2344 } 2345 2346 // -- Determine output mat reuse 2347 FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX]; 2348 2349 for (CeedInt i = 0; i < num_output_fields; i++) { 2350 output_matrix_reuse[i].index = -1; 2351 } 2352 for (CeedInt i = 0; i < num_output_fields; i++) { 2353 bool is_tensor = true; 2354 CeedEvalMode eval_mode_i; 2355 CeedBasis basis_i; 2356 2357 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i)); 2358 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i)); 2359 CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor)); 2360 for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) { 2361 CeedEvalMode eval_mode_j; 2362 CeedBasis basis_j; 2363 2364 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); 2365 if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 2366 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); 2367 if (basis_i == basis_j) { 2368 if (is_tensor) { 2369 output_matrix_reuse[i].index = j; 2370 output_matrix_reuse[i].is_input = true; 2371 output_matrix_reuse[i].eval_mode = eval_mode_j; 2372 } else { 2373 // For non-tensor can only re-use with the same eval mode 2374 if (eval_mode_i == eval_mode_j) { 2375 output_matrix_reuse[i].index = j; 2376 output_matrix_reuse[i].is_input = true; 2377 output_matrix_reuse[i].eval_mode = eval_mode_j; 2378 } 2379 } 2380 } 2381 CeedCallBackend(CeedBasisDestroy(&basis_j)); 2382 } 2383 for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) { 2384 CeedEvalMode eval_mode_j; 2385 CeedBasis basis_j; 2386 2387 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j)); 2388 if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 2389 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j)); 2390 if 
(basis_i == basis_j) { 2391 if (is_tensor) { 2392 output_matrix_reuse[i].index = j; 2393 output_matrix_reuse[i].is_input = false; 2394 output_matrix_reuse[i].eval_mode = eval_mode_j; 2395 } else { 2396 // For non-tensor can only re-use with the same eval mode 2397 if (eval_mode_i == eval_mode_j) { 2398 output_matrix_reuse[i].index = j; 2399 output_matrix_reuse[i].is_input = false; 2400 output_matrix_reuse[i].eval_mode = eval_mode_j; 2401 } 2402 } 2403 } 2404 CeedCallBackend(CeedBasisDestroy(&basis_j)); 2405 } 2406 CeedCallBackend(CeedBasisDestroy(&basis_i)); 2407 } 2408 2409 // Initialize constants, and matrices B and G 2410 code << "\n" << tab << "// Input field constants and basis data\n"; 2411 for (CeedInt i = 0; i < num_input_fields; i++) { 2412 CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], 2413 max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, true)); 2414 } 2415 code << "\n" << tab << "// Output field constants and basis data\n"; 2416 for (CeedInt i = 0; i < num_output_fields; i++) { 2417 CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], 2418 max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, true)); 2419 } 2420 2421 // Loop over all elements 2422 code << "\n" << tab << "// Element loop\n"; 2423 code << tab << "__syncthreads();\n"; 2424 code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n"; 2425 tab.push(); 2426 2427 // -- Compute minimum buffer space needed 2428 CeedInt max_rstr_buffer_size = 1; 2429 2430 for (CeedInt i = 0; i < num_input_fields; i++) { 2431 CeedEvalMode eval_mode; 2432 2433 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 2434 if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) { 2435 CeedInt num_comp; 2436 
CeedElemRestriction elem_rstr; 2437 2438 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 2439 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 2440 max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); 2441 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 2442 } 2443 } 2444 for (CeedInt i = 0; i < num_output_fields; i++) { 2445 CeedEvalMode eval_mode; 2446 2447 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 2448 if (eval_mode != CEED_EVAL_NONE) { 2449 CeedInt num_comp; 2450 CeedElemRestriction elem_rstr; 2451 2452 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 2453 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 2454 max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); 2455 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 2456 } 2457 } 2458 code << tab << "// Scratch restriction buffer space\n"; 2459 code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n"; 2460 2461 // -- Determine best input field processing order 2462 CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX]; 2463 2464 for (CeedInt i = 0; i < num_input_fields; i++) { 2465 field_rstr_in_buffer[i] = -1; 2466 input_field_order[i] = -1; 2467 } 2468 { 2469 bool is_ordered[CEED_FIELD_MAX]; 2470 CeedInt curr_index = 0; 2471 2472 for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false; 2473 for (CeedInt i = 0; i < num_input_fields; i++) { 2474 CeedVector vec_i; 2475 CeedElemRestriction rstr_i; 2476 2477 if (is_ordered[i]) continue; 2478 field_rstr_in_buffer[i] = i; 2479 is_ordered[i] = true; 2480 input_field_order[curr_index] = i; 2481 curr_index++; 2482 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i)); 2483 if (vec_i == 
CEED_VECTOR_NONE) continue; // CEED_EVAL_WEIGHT 2484 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i)); 2485 for (CeedInt j = i + 1; j < num_input_fields; j++) { 2486 CeedVector vec_j; 2487 CeedElemRestriction rstr_j; 2488 2489 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j)); 2490 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j)); 2491 if (rstr_i == rstr_j && vec_i == vec_j) { 2492 field_rstr_in_buffer[j] = i; 2493 is_ordered[j] = true; 2494 input_field_order[curr_index] = j; 2495 curr_index++; 2496 } 2497 CeedCallBackend(CeedVectorDestroy(&vec_j)); 2498 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); 2499 } 2500 CeedCallBackend(CeedVectorDestroy(&vec_i)); 2501 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); 2502 } 2503 } 2504 2505 // -- Input restriction and basis 2506 code << "\n" << tab << "// -- Input field restrictions and basis actions\n"; 2507 CeedInt num_active_in = 0, num_active_out = 0, qf_assembly_size_out = 0; 2508 CeedInt active_fields_in[CEED_FIELD_MAX], active_fields_out[CEED_FIELD_MAX]; 2509 2510 for (CeedInt i = 0; i < num_input_fields; i++) { 2511 bool is_active = false; 2512 const char *field_name; 2513 const CeedInt f = input_field_order[i]; 2514 2515 { 2516 CeedVector vec; 2517 2518 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec)); 2519 is_active = vec == CEED_VECTOR_ACTIVE; 2520 CeedCallBackend(CeedVectorDestroy(&vec)); 2521 } 2522 2523 CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name)); 2524 code << tab << "// ---- Input field " << f << ": " << field_name << "\n"; 2525 2526 if (is_active) { 2527 CeedEvalMode eval_mode; 2528 CeedInt field_size; 2529 2530 active_fields_in[num_active_in] = f; 2531 num_active_in++; 2532 CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[f], &field_size)); 2533 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[f], &eval_mode)); 2534 if (eval_mode == 
CEED_EVAL_GRAD) { 2535 code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << "dim_in_" << f << "*" 2536 << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n"; 2537 } else { 2538 code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n"; 2539 } 2540 code << tab << "const CeedInt field_size_in_" << f << " = " << field_size << ";\n"; 2541 } else { 2542 // ---- Restriction 2543 CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], 2544 max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices)); 2545 2546 // ---- Basis action 2547 CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true, 2548 is_all_tensor, is_at_points, use_3d_slices)); 2549 } 2550 } 2551 code << tab << "const CeedInt field_sizes_in[" << num_active_in << "] = {"; 2552 for (CeedInt i = 0; i < num_active_in; i++) { 2553 code << "field_size_in_" << active_fields_in[i] << (i < num_active_in - 1 ? ", " : ""); 2554 } 2555 code << "};\n"; 2556 code << tab << "CeedScalar * r_q_in[" << num_active_in << "] = {"; 2557 for (CeedInt i = 0; i < num_active_in; i++) { 2558 code << "r_q_in_" << active_fields_in[i] << (i < num_active_in - 1 ? 
", " : ""); 2559 } 2560 code << "};\n"; 2561 2562 for (CeedInt i = 0; i < num_output_fields; i++) { 2563 bool is_active = false; 2564 2565 { 2566 CeedVector vec; 2567 2568 CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); 2569 is_active = vec == CEED_VECTOR_ACTIVE; 2570 CeedCallBackend(CeedVectorDestroy(&vec)); 2571 } 2572 if (is_active) { 2573 const char *field_name; 2574 CeedInt field_size; 2575 2576 active_fields_out[num_active_out] = i; 2577 num_active_out++; 2578 CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size)); 2579 qf_assembly_size_out += field_size; 2580 CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 2581 code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 2582 code << tab << "const CeedInt field_size_out_" << i << " = " << field_size << ";\n"; 2583 } 2584 } 2585 code << tab << "const CeedInt field_sizes_out[" << num_active_out << "] = {"; 2586 for (CeedInt i = 0; i < num_active_out; i++) { 2587 code << "field_size_out_" << active_fields_out[i] << (i < num_active_out - 1 ? 
", " : ""); 2588 } 2589 code << "};\n"; 2590 code << tab << "const CeedInt total_size_out = " << qf_assembly_size_out << ";\n"; 2591 2592 // -- Loop over active field 2593 code << "\n" << tab << "CeedInt input_offset = 0;\n"; 2594 code << tab << "// Loop over active QFunction input fields\n"; 2595 code << tab << "const CeedInt num_active_in = " << num_active_in << ";\n"; 2596 code << tab << "for (CeedInt a = 0; a < num_active_in; a++) {\n"; 2597 tab.push(); 2598 2599 // -- Loop over size of active field 2600 code << "\n" << tab << "// Loop over current active input field size\n"; 2601 code << tab << "const CeedInt field_size_in = field_sizes_in[a];\n"; 2602 code << tab << "for (CeedInt s = 0; s < field_size_in; s++) {\n"; 2603 tab.push(); 2604 2605 // -- Set current active point and component to 1 2606 code << tab << "// Set current active point and component to 1.0\n"; 2607 if (is_all_tensor && (max_dim >= 3)) { 2608 code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 1.0;\n"; 2609 } else { 2610 code << tab << "r_q_in[a][s] = 1.0;\n"; 2611 } 2612 2613 // -- Q function 2614 CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields, 2615 qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name, 2616 Q_1d, is_all_tensor, is_at_points, use_3d_slices, true)); 2617 2618 // -- Output basis and restriction 2619 code << "\n" << tab << "// -- Output field basis action and restrictions\n"; 2620 CeedScalar offset = 0; 2621 2622 for (CeedInt i = 0; i < num_output_fields; i++) { 2623 bool is_active = false; 2624 const char *field_name; 2625 2626 { 2627 CeedVector vec; 2628 2629 CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); 2630 is_active = vec == CEED_VECTOR_ACTIVE; 2631 CeedCallBackend(CeedVectorDestroy(&vec)); 2632 } 2633 if (!is_active) continue; 2634 2635 CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], 
&field_name)); 2636 code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 2637 2638 // ---- Restriction 2639 CeedInt field_size; 2640 2641 code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d_QFAssembly<total_size_out, field_size_out_" << i << ", " 2642 << (is_all_tensor ? "Q_1d" : "Q") << ">(data, num_elem, elem, input_offset + s, " << offset << ", r_q_out_" << i << ", values_array);\n"; 2643 CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size)); 2644 offset += field_size; 2645 } 2646 2647 // -- Reset current active node and component 2648 code << "\n" << tab << "// Reset current active node and component to 0.0\n"; 2649 if (is_all_tensor && (max_dim >= 3)) { 2650 code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 0.0;\n"; 2651 } else { 2652 code << tab << "r_q_in[a][s] = 0.0;\n"; 2653 } 2654 2655 // -- End of loop over size of active field 2656 tab.pop(); 2657 code << tab << "}\n"; 2658 code << tab << "input_offset += field_size_in;\n"; 2659 2660 // -- End of loop over active field 2661 tab.pop(); 2662 code << tab << "}\n"; 2663 2664 // Close loop and function 2665 tab.pop(); 2666 code << tab << "}\n"; 2667 tab.pop(); 2668 code << tab << "}\n"; 2669 code << tab << "// -----------------------------------------------------------------------------\n\n"; 2670 2671 CeedInt block_sizes[3] = {0, 0, 0}; 2672 CeedInt num_elem; 2673 2674 // Compile 2675 CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); 2676 CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes)); 2677 { 2678 bool is_compile_good = false; 2679 2680 data->thread_1d = block_sizes[0]; 2681 CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 2, "OP_T_1D", block_sizes[0], 2682 "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2])); 2683 if (is_compile_good) { 2684 *is_good_build = true; 2685 
CeedCallBackend(CeedGetKernel_Hip(ceed, data->module_assemble_qfunction, operator_name.c_str(), &data->assemble_qfunction)); 2686 } else { 2687 *is_good_build = false; 2688 data->use_assembly_fallback = true; 2689 } 2690 } 2691 CeedCallBackend(CeedDestroy(&ceed)); 2692 CeedCallBackend(CeedQFunctionDestroy(&qf)); 2693 return CEED_ERROR_SUCCESS; 2694 } 2695 2696 //------------------------------------------------------------------------------ 2697