1 // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #define CEED_DEBUG_COLOR 12 9 10 #include <ceed.h> 11 #include <ceed/backend.h> 12 #include <ceed/gen-tools.h> 13 #include <ceed/jit-tools.h> 14 15 #include <iostream> 16 #include <sstream> 17 #include <string> 18 19 #include "../hip-ref/ceed-hip-ref.h" 20 #include "../hip-shared/ceed-hip-shared.h" 21 #include "../hip/ceed-hip-common.h" 22 #include "../hip/ceed-hip-compile.h" 23 #include "ceed-hip-gen.h" 24 25 struct FieldReuse_Hip { 26 CeedInt index; 27 bool is_input; 28 CeedEvalMode eval_mode; 29 }; 30 31 //------------------------------------------------------------------------------ 32 // Calculate the block size used for launching the operator kernel 33 //------------------------------------------------------------------------------ 34 extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt num_elem, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes) { 35 const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 36 if (dim == 1) { 37 CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; 38 39 elems_per_block = elems_per_block > 0 ? elems_per_block : 1; 40 block_sizes[0] = thread_1d; 41 block_sizes[1] = 1; 42 block_sizes[2] = elems_per_block; 43 } else if (dim == 2) { 44 const CeedInt elems_per_block = thread_1d < 4 ? 16 : 2; 45 46 block_sizes[0] = thread_1d; 47 block_sizes[1] = thread_1d; 48 block_sizes[2] = elems_per_block; 49 } else if (dim == 3) { 50 const CeedInt elems_per_block = thread_1d < 6 ? 4 : (thread_1d < 8 ? 
2 : 1); 51 52 block_sizes[0] = thread_1d; 53 block_sizes[1] = thread_1d; 54 block_sizes[2] = elems_per_block; 55 } 56 return CEED_ERROR_SUCCESS; 57 } 58 59 //------------------------------------------------------------------------------ 60 // Determine type of operator 61 //------------------------------------------------------------------------------ 62 static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields, 63 CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields, 64 CeedQFunctionField *qf_output_fields, CeedInt *max_P, CeedInt *max_P_1d, CeedInt *Q, CeedInt *Q_1d, 65 CeedInt *max_dim, bool *is_all_tensor, bool *use_3d_slices) { 66 // Check if all are tensor 67 *is_all_tensor = true; 68 for (CeedInt i = 0; i < num_input_fields; i++) { 69 CeedBasis basis; 70 71 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 72 if (basis != CEED_BASIS_NONE) { 73 bool is_field_tensor; 74 75 CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor)); 76 *is_all_tensor = *is_all_tensor && is_field_tensor; 77 } 78 CeedCallBackend(CeedBasisDestroy(&basis)); 79 } 80 for (CeedInt i = 0; i < num_output_fields; i++) { 81 CeedBasis basis; 82 83 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 84 if (basis != CEED_BASIS_NONE) { 85 bool is_field_tensor; 86 87 CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor)); 88 *is_all_tensor = *is_all_tensor && is_field_tensor; 89 } 90 CeedCallBackend(CeedBasisDestroy(&basis)); 91 } 92 93 // Find max_P, max_P_1d, Q, and Q_1d 94 bool is_all_3d = true; 95 96 *max_P = 0; 97 *max_P_1d = 0; 98 *Q = 0; 99 *Q_1d = 0; 100 for (CeedInt i = 0; i < num_input_fields; i++) { 101 CeedBasis basis; 102 103 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 104 if (basis != CEED_BASIS_NONE) { 105 bool is_field_tensor; 106 CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, 
          field_Q_1d = 0;

      // Check if 3D
      CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
      is_all_3d = is_all_3d && (field_dim == 3);
      *max_dim = CeedIntMax(*max_dim, field_dim);

      // Collect P, P_1d, Q, and Q_1d
      CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
      *max_P = CeedIntMax(*max_P, field_P);
      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
      if (is_field_tensor) {
        CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
        *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
      }
      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
      CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
      *Q = field_Q;
      if (is_field_tensor) {
        CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
        CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
        *Q_1d = field_Q_1d;
      }
    }
    CeedCallBackend(CeedBasisDestroy(&basis));
  }
  // Same scan over the output fields
  for (CeedInt i = 0; i < num_output_fields; i++) {
    CeedBasis basis;

    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
    if (basis != CEED_BASIS_NONE) {
      bool    is_field_tensor;
      CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;

      // Check if 3D
      CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
      is_all_3d = is_all_3d && (field_dim == 3);
      *max_dim = CeedIntMax(*max_dim, field_dim);

      // Collect P, P_1d, Q, and Q_1d
      CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
      *max_P = CeedIntMax(*max_P, field_P);
      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
      if (is_field_tensor) {
        CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
        *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
      }
      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
      CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
      *Q = field_Q;
      if (is_field_tensor) {
        CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
        CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
        *Q_1d = field_Q_1d;
      }
    }
    CeedCallBackend(CeedBasisDestroy(&basis));
  }

  // Only use 3D collocated gradient parallelization strategy when gradient is computed
  // use_3d_slices is true only if every CEED_EVAL_GRAD field has a collocated
  // gradient matrix (d_collo_grad_1d) and all bases are 3D tensor bases.
  *use_3d_slices = false;
  if (is_all_3d && *is_all_tensor) {
    bool was_grad_found = false;

    for (CeedInt i = 0; i < num_input_fields; i++) {
      CeedEvalMode eval_mode;

      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
      if (eval_mode == CEED_EVAL_GRAD) {
        CeedBasis_Hip_shared *basis_data;
        CeedBasis             basis;

        CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
        CeedCallBackend(CeedBasisGetData(basis, &basis_data));
        *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
        was_grad_found = true;
        CeedCallBackend(CeedBasisDestroy(&basis));
      }
    }
    for (CeedInt i = 0; i < num_output_fields; i++) {
      CeedEvalMode eval_mode;

      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      if (eval_mode == CEED_EVAL_GRAD) {
        CeedBasis_Hip_shared *basis_data;
        CeedBasis             basis;

        CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
        CeedCallBackend(CeedBasisGetData(basis, &basis_data));
        *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
        was_grad_found = true;
        CeedCallBackend(CeedBasisDestroy(&basis));
      }
    }
  }
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Setup fields
//
// Emits the per-field constants (dim, P/P_1d, num_comp) into the generated
// kernel source and stages the basis matrices the kernel loads into shared
// memory (s_B, and s_G for gradients).  Device pointers for the matrices are
// recorded in data->B / data->G so the launch code can pass them to the
// kernel.  field_reuse describes an earlier field whose already-loaded
// shared-memory matrices this field may alias instead of re-loading.
//------------------------------------------------------------------------------
static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i,
                                                    CeedOperatorField op_field, CeedQFunctionField qf_field, FieldReuse_Hip field_reuse,
                                                    CeedInt max_dim, CeedInt Q, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
                                                    bool use_3d_slices, bool skip_active_load) {
  bool      is_tensor = true, is_active = true;
  CeedBasis basis;

  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
  if (basis != CEED_BASIS_NONE) CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
  {
    CeedVector vec;

    CeedCallBackend(CeedOperatorFieldGetVector(op_field, &vec));
    is_active = vec == CEED_VECTOR_ACTIVE;
    CeedCallBackend(CeedVectorDestroy(&vec));
  }

  const char           *field_name;
  std::string           var_suffix  = (is_input ? "_in_" : "_out_") + std::to_string(i);
  std::string           P_name      = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
  std::string           option_name = (is_input ? "inputs" : "outputs");
  CeedEvalMode          eval_mode   = CEED_EVAL_NONE;
  // elem_size is collected alongside num_comp below but is not used in this
  // function (it is used by the restriction/basis build helpers).
  CeedInt               elem_size = 0, num_comp = 0, dim = max_dim, P_1d = 0;
  CeedElemRestriction   elem_rstr;
  CeedBasis_Hip_shared *basis_data;

  // Field reuse info
  bool use_previous_field = field_reuse.index != -1;

  CeedCallBackend(CeedOperatorFieldGetName(op_field, &field_name));
  code << tab << "// -- " << (is_input ? "Input" : "Output") << " field " << i << ": " << field_name << "\n";

  // Get field data
  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
  }
  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
  if (basis != CEED_BASIS_NONE) {
    CeedCallBackend(CeedBasisGetData(basis, &basis_data));
    CeedCallBackend(CeedBasisGetDimension(basis, &dim));
    if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
    else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
  }
  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));

  // Set field constants
  code << tab << "const CeedInt dim" << var_suffix << " = " << dim << ";\n";
  if (is_tensor && !is_all_tensor) {
    CeedInt P = 0;

    CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
    code << tab << "const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q : P) << ";\n";
  }
  code << tab << "const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
  if (eval_mode != CEED_EVAL_WEIGHT) {
    code << tab << "const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
  }

  // Load basis data
  code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
  switch (eval_mode) {
    case CEED_EVAL_NONE:
      break;
    case CEED_EVAL_INTERP:
      if (is_at_points) {
        // AtPoints
        // Lazily build and upload the Chebyshev interpolation matrix to the
        // device; it is cached on the basis data for reuse.
        // NOTE(review): this upload block is duplicated in CEED_EVAL_GRAD
        // below — candidate for a shared helper.
        if (!basis_data->d_chebyshev_interp_1d) {
          CeedSize    interp_bytes;
          CeedScalar *chebyshev_interp_1d;

          interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
          CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
          CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
          CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
          CeedCallHip(CeedBasisReturnCeed(basis),
                      hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
          CeedCallBackend(CeedFree(&chebyshev_interp_1d));
        }
        if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
        else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
      } else {
        // Standard quadrature
        if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
        else data->B.outputs[i] = basis_data->d_interp_1d;
      }
      if (use_previous_field && !skip_active_load) {
        // Alias the interpolation matrix already loaded for an earlier field
        std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

        code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
      } else {
        bool is_collocated = false;

        CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
        if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) {
          code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
        } else {
          code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
          code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
        }
      }
      break;
    case CEED_EVAL_GRAD:
      if (is_at_points) {
        // AtPoints
        if (!basis_data->d_chebyshev_interp_1d) {
          CeedSize    interp_bytes;
          CeedScalar *chebyshev_interp_1d;

          interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
          CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
          CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
          CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
          CeedCallHip(CeedBasisReturnCeed(basis),
                      hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
          CeedCallBackend(CeedFree(&chebyshev_interp_1d));
        }
        if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
        else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
      } else {
        // Standard quadrature
        if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
        else data->B.outputs[i] = basis_data->d_interp_1d;
      }
      if (is_tensor) {
        // Gradients of tensor bases also need the interpolation matrix s_B
        if (use_previous_field && !skip_active_load) {
          std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

          code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
        } else {
          bool is_collocated = false;

          CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
          if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) {
            code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
          } else {
            code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
            code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
          }
        }
      }
      if (is_at_points) break;  // No G mat for AtPoints
      if (use_3d_slices) {
        // 3D slice strategy: gradient uses the collocated Q x Q matrix
        if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
        else data->G.outputs[i] = basis_data->d_collo_grad_1d;
        if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
          std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

          code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
        } else if (is_active && skip_active_load) {
          code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
        } else {
          code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
          code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
        }
      } else {
        bool has_collo_grad = basis_data->d_collo_grad_1d;

        // Prefer the collocated gradient matrix when the basis provides one
        if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
        else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
        if (has_collo_grad) {
          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
            std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

            code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
          } else if (is_active && skip_active_load) {
            code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
          } else {
            code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
            code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
          }
        } else {
          // Full P x Q gradient matrix (times dim for non-tensor bases)
          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
            std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

            code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
          } else if (is_active && skip_active_load) {
            code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
          } else {
            code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim")
                 << (is_tensor ? "" : var_suffix) << "];\n";
            code << tab << "LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << (is_tensor ? "" : var_suffix) << ">(data, G."
                 << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
          }
        }
      }
      break;
    case CEED_EVAL_WEIGHT:
      break;  // No action
    // LCOV_EXCL_START
    case CEED_EVAL_DIV:
    case CEED_EVAL_CURL:
      break;  // TODO: Not implemented
    // LCOV_EXCL_STOP
  }
  CeedCallBackend(CeedBasisDestroy(&basis));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Restriction
//
// Emits the generated-kernel code that gathers an input field from its
// L-vector into element registers (r_e_*) or scatters an output field back,
// and records the device offset arrays in data->indices.
//------------------------------------------------------------------------------
static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i,
                                                      CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
                                                      CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
                                                      bool use_3d_slices) {
  std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
  std::string P_name     = (is_all_tensor ?
"P_1d" : "P") + var_suffix; 416 CeedEvalMode eval_mode = CEED_EVAL_NONE; 417 CeedInt elem_size = 0, num_comp = 0; 418 CeedSize l_size; 419 CeedRestrictionType rstr_type = CEED_RESTRICTION_STANDARD; 420 CeedElemRestriction_Hip *rstr_data; 421 CeedElemRestriction elem_rstr; 422 423 // Get field data 424 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr)); 425 if (elem_rstr != CEED_ELEMRESTRICTION_NONE) { 426 CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type)); 427 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 428 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 429 CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); 430 } 431 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode)); 432 433 // Restriction 434 if (is_input) { 435 // Input 436 if (field_input_buffer[i] != i) { 437 std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]); 438 439 // Restriction was already done for previous input 440 code << tab << "CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n"; 441 } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) { 442 if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) { 443 // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated 444 code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n"; 445 } else if (rstr_type != CEED_RESTRICTION_POINTS) { 446 // Otherwise we're using the scratch space 447 code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n"; 448 } 449 switch (rstr_type) { 450 case CEED_RESTRICTION_STANDARD: { 451 CeedInt comp_stride; 452 453 CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 454 code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; 455 
CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 456 code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 457 data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; 458 code << tab << "ReadLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", " 459 << P_name << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix 460 << ");\n"; 461 break; 462 } 463 case CEED_RESTRICTION_STRIDED: { 464 bool has_backend_strides; 465 CeedInt num_elem; 466 467 CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); 468 CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); 469 CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 470 471 if (!has_backend_strides) { 472 CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); 473 } 474 code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1] 475 << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n"; 476 code << tab << "ReadLVecStrided" << (is_all_tensor ? 
max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides" 477 << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, d" << var_suffix << ", r_e" 478 << var_suffix << ");\n"; 479 break; 480 } 481 case CEED_RESTRICTION_POINTS: { 482 CeedInt comp_stride; 483 484 CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 485 code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 486 data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; 487 break; 488 } 489 // LCOV_EXCL_START 490 case CEED_RESTRICTION_ORIENTED: 491 case CEED_RESTRICTION_CURL_ORIENTED: 492 break; // TODO: Not implemented 493 // LCOV_EXCL_STOP 494 } 495 } 496 } else { 497 // Output 498 switch (rstr_type) { 499 case CEED_RESTRICTION_STANDARD: { 500 CeedInt comp_stride; 501 502 CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 503 code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; 504 CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 505 code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 506 data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets; 507 code << tab << "WriteLVecStandard" << (is_all_tensor ? 
max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", " 508 << P_name << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix 509 << ");\n"; 510 break; 511 } 512 case CEED_RESTRICTION_STRIDED: { 513 bool has_backend_strides; 514 CeedInt num_elem; 515 516 CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); 517 CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); 518 CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 519 520 if (!has_backend_strides) { 521 CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); 522 } 523 code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1] 524 << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n"; 525 code << tab << "WriteLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides" 526 << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, r_e" << var_suffix << ", d" << var_suffix 527 << ");\n"; 528 break; 529 } 530 case CEED_RESTRICTION_POINTS: 531 data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets; 532 break; 533 // LCOV_EXCL_START 534 case CEED_RESTRICTION_ORIENTED: 535 case CEED_RESTRICTION_CURL_ORIENTED: 536 break; // TODO: Not implemented 537 // LCOV_EXCL_STOP 538 } 539 } 540 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 541 return CEED_ERROR_SUCCESS; 542 } 543 544 //------------------------------------------------------------------------------ 545 // Basis 546 //------------------------------------------------------------------------------ 547 static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i, CeedOperatorField op_field, 548 CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d, bool is_input, 
bool is_all_tensor, 549 bool is_at_points, bool use_3d_slices) { 550 bool is_tensor = true, is_collocated = true; 551 CeedBasis basis; 552 CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis)); 553 CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 554 CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated)); 555 556 std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i); 557 std::string P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q"; 558 CeedEvalMode eval_mode = CEED_EVAL_NONE; 559 CeedInt dim = max_dim, elem_size = 0, num_comp = 0, P_1d = 0; 560 CeedElemRestriction elem_rstr; 561 562 // Get field data 563 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr)); 564 if (elem_rstr != CEED_ELEMRESTRICTION_NONE) { 565 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 566 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 567 } 568 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 569 if (basis != CEED_BASIS_NONE) { 570 CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 571 if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 572 else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d)); 573 } 574 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode)); 575 576 // Basis 577 code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 578 if (is_input) { 579 switch (eval_mode) { 580 case CEED_EVAL_NONE: 581 if (!use_3d_slices && !is_at_points) { 582 code << tab << "CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n"; 583 } 584 break; 585 case CEED_EVAL_INTERP: 586 if (is_at_points) { 587 std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d"; 588 589 code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? 
Q_name : "1") << "];\n"; 590 code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix 591 << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n"; 592 } else { 593 std::string function_name = is_tensor ? ((dim == 1 ? "Interp" : "InterpTensor") + std::string(is_collocated ? "CollocatedNodes" : "") + 594 std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened")) 595 : "InterpNonTensor"; 596 std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name); 597 598 code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n"; 599 code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e" 600 << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n"; 601 } 602 break; 603 case CEED_EVAL_GRAD: 604 if (is_at_points) { 605 std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d"; 606 607 code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n"; 608 code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix 609 << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n"; 610 } else if (use_3d_slices) { 611 std::string function_name = 612 (dim > 1 ? "InterpTensor" : "Interp") + std::string(is_collocated ? 
"CollocatedNodes" : "") + std::to_string(dim) + "d"; 613 614 code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n"; 615 code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix 616 << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n"; 617 } else if (is_tensor) { 618 bool is_collocated_grad = dim == 3 && Q_1d >= P_1d; 619 std::string function_name = 620 (dim == 1 ? "Grad" : ("GradTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) + 621 std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"); 622 std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name); 623 624 code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*" 625 << (is_all_tensor && dim >= 3 ? Q_name : "1") << "];\n"; 626 code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e" 627 << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n"; 628 } else { 629 std::string function_name = "GradNonTensor"; 630 631 code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n"; 632 code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name 633 << ", OP_T_1D>(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n"; 634 } 635 break; 636 case CEED_EVAL_WEIGHT: { 637 if (is_at_points) { 638 code << tab << "// Nothing to do AtPoints\n"; 639 } else { 640 CeedBasis_Hip_shared *basis_data; 641 std::string function_name = is_tensor 642 ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d" + (is_all_tensor ? 
"" : "Flattened")) 643 : "WeightNonTensor"; 644 645 code << tab << "CeedScalar r_q" << var_suffix << "[" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n"; 646 CeedCallBackend(CeedBasisGetData(basis, &basis_data)); 647 data->W = basis_data->d_q_weight_1d; 648 code << tab << function_name << "<" << P_name << ", " << Q_name << ">(data, W, r_q" << var_suffix << ");\n"; 649 } 650 break; 651 } 652 // LCOV_EXCL_START 653 case CEED_EVAL_DIV: 654 case CEED_EVAL_CURL: 655 break; // TODO: Not implemented 656 // LCOV_EXCL_STOP 657 } 658 } else { 659 switch (eval_mode) { 660 case CEED_EVAL_NONE: 661 code << tab << "CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n"; 662 break; // No action 663 case CEED_EVAL_INTERP: 664 code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n"; 665 if (is_at_points) { 666 std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d"; 667 668 code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix 669 << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; 670 } else { 671 std::string function_name = 672 is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") + 673 std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened")) 674 : "InterpTransposeNonTensor"; 675 std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name); 676 677 code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q" 678 << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; 679 } 680 break; 681 case CEED_EVAL_GRAD: 682 code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n"; 683 if (is_at_points) { 684 std::string function_name = (dim == 1 ? 
"InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d"; 685 686 code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix 687 << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; 688 } else if (use_3d_slices) { 689 std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") + 690 std::to_string(dim) + "d"; 691 692 code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix 693 << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; 694 } else if (is_tensor) { 695 bool is_collocated_grad = dim == 3 && Q_1d >= P_1d; 696 std::string function_name = 697 (dim == 1 ? "GradTranspose" 698 : ("GradTransposeTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) + 699 std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"); 700 std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? 
P_name : Q_name);

          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
               << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
        } else {
          // Non-tensor bases have a single transpose gradient kernel form
          std::string function_name = "GradTransposeNonTensor";

          code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
               << ", OP_T_1D>(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
        }
        break;
      // LCOV_EXCL_START
      case CEED_EVAL_WEIGHT:
        break;  // Should not occur
      case CEED_EVAL_DIV:
      case CEED_EVAL_CURL:
        break;  // TODO: Not implemented
        // LCOV_EXCL_STOP
    }
  }
  CeedCallBackend(CeedBasisDestroy(&basis));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// QFunction
//
// Emits the kernel-source section that sets up QFunction output registers,
// gathers per-point/per-slice input values, calls the user QFunction, and maps
// results back. Three mutually exclusive layouts are generated:
//   - is_at_points:   loop over batches of arbitrary points per element
//   - use_3d_slices:  loop over 1D planes (slices) of a 3D tensor element
//   - otherwise:      operate on full elements in registers
// Generated C source is appended to `code`; `tab` tracks the indentation of
// the emitted text (tab.push()/tab.pop() must stay balanced with the emitted
// braces). Host-side setup data (e.g. data->indices.inputs) is also filled in
// where the generated kernel needs device offset arrays.
//------------------------------------------------------------------------------
static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt max_dim,
                                                    CeedInt max_num_points, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
                                                    CeedQFunctionField *qf_input_fields, CeedInt num_output_fields,
                                                    CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields,
                                                    std::string qfunction_name, CeedInt Q_1d, bool is_all_tensor, bool is_at_points,
                                                    bool use_3d_slices) {
  // Quadrature size symbol in the generated source: "Q_1d" for all-tensor
  // operators, "Q" otherwise
  std::string         Q_name    = is_all_tensor ? "Q_1d" : "Q";
  CeedEvalMode        eval_mode = CEED_EVAL_NONE;
  CeedElemRestriction elem_rstr;

  // Setup output arrays
  // Declare the quadrature-space (r_q) or Chebyshev-coefficient (r_c)
  // registers each output field will accumulate into
  code << "\n";
  code << tab << "// -- Output field setup\n";
  for (CeedInt i = 0; i < num_output_fields; i++) {
    const char *field_name;
    std::string var_suffix = "_out_" + std::to_string(i);

    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
    code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
    switch (eval_mode) {
      case CEED_EVAL_NONE:
        if (is_at_points) {
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n";
        } else {
          // Only all-tensor 3D elements hold a full line of quadrature points
          // per thread; otherwise one value per component
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
               << "];\n";
        }
        break;
      case CEED_EVAL_INTERP:
        if (is_at_points) {
          // Accumulator for point data
          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
          code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
               << "[i] = 0.0;\n";
        } else {
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
               << "];\n";
        }
        break;
      case CEED_EVAL_GRAD:
        if (is_at_points) {
          // Accumulator for point data
          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
          code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
               << "[i] = 0.0;\n";
        } else if (use_3d_slices) {
          // Accumulator for gradient slices
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
          code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) r_q" << var_suffix << "[i] = 0.0;\n";
        } else {
          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
               << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") << "];\n";
        }
        break;
      case CEED_EVAL_WEIGHT:
        break;
      // LCOV_EXCL_START
      case CEED_EVAL_DIV:
      case CEED_EVAL_CURL:
        break;  // TODO: Not implemented
        // LCOV_EXCL_STOP
    }
  }

  if (is_at_points) {
    // We need to handle batches of points
    // Open the point loop; the matching tab.pop()/"}" is emitted at the end of
    // this function
    code << "\n";
    code << tab << "// Note: Using batches of points\n";
    code << tab << "const CeedInt point_loop_bound = (blockDim.x*blockDim.y) * ceil((1.0*max_num_points) / (blockDim.x*blockDim.y));\n\n";
    code << tab << "#pragma unroll\n";
    code << tab << "for (CeedInt i = threadIdx.x + threadIdx.y*blockDim.x; i < point_loop_bound; i += blockDim.x*blockDim.y) {\n";
    tab.push();
    code << tab << "const CeedInt p = i % max_num_points;\n\n";

    code << tab << "// -- Coordinates\n";
    code << tab << "CeedScalar r_x[max_dim];\n";
    code << tab << "ReadPoint<max_dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n";

    // Gather each input field's value at the current point into r_s registers
    code << tab << "// -- Input fields\n";
    for (CeedInt i = 0; i < num_input_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_in_" + std::to_string(i);
      std::string P_name     = "P_1d" + var_suffix;

      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
      code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
      // Basis action
      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          code << tab << "ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
               << ", max_num_points>(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
          break;
        case CEED_EVAL_INTERP:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          code << tab << "InterpAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
               << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
          break;
        case CEED_EVAL_GRAD:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          code << tab << "GradAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
               << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
          break;
        case CEED_EVAL_WEIGHT:
          // Weight at arbitrary points is identically 1.0
          code << tab << "CeedScalar r_s" << var_suffix << "[1];\n";
          code << tab << "r_s" << var_suffix << "[0] = 1.0;\n";
          break;
        // LCOV_EXCL_START
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
          // LCOV_EXCL_STOP
      }
    }
    // Declare per-point output registers (filled by the QFunction call below)
    code << "\n";
    code << tab << "// -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_out_" + std::to_string(i);

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      // Basis action
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          break;
        case CEED_EVAL_INTERP:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          break;
        case CEED_EVAL_GRAD:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          break;
        // LCOV_EXCL_START
        case CEED_EVAL_WEIGHT:
          break;  // Should not occur
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
          // LCOV_EXCL_STOP
      }
    }

  } else if (use_3d_slices) {
    // We treat quadrature points per slice in 3d to save registers
    // Open the slice loop; the matching tab.pop()/"}" is emitted at the end of
    // this function
    code << "\n";
    code << tab << "// Note: Using planes of 3D elements\n";
    code << tab << "#pragma unroll\n";
    code << tab << "for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
    tab.push();
    code << tab << "// -- Input fields\n";
    for (CeedInt i = 0; i < num_input_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_in_" + std::to_string(i);

      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
      code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
      // Basis action
      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          bool is_strided;

          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";

          // EVAL_NONE inputs are read slice-by-slice straight from the device
          // array; emit the read with either strided or offset indexing
          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
          CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
          if (is_strided) {
            bool    has_backend_strides;
            CeedInt num_elem, elem_size;

            CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
            CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
            CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
            // Default backend strides; overwritten below if the restriction
            // carries user-provided strides
            CeedInt strides[3] = {1, elem_size * num_elem, elem_size};

            if (!has_backend_strides) {
              CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
            }
            code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
                 << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
            code << tab << "ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", strides" << var_suffix << "_0, strides"
                 << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n";
          } else {
            CeedSize                 l_size = 0;
            CeedInt                  comp_stride;
            CeedElemRestriction_Hip *rstr_data;

            CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
            code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
            CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
            code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
            CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
            // Host-side: stash the device offsets array so the kernel launch
            // can pass it as indices.inputs[i]
            data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
            code << tab << "ReadEVecSliceStandard3d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", " << Q_name << ">(data, l_size"
                 << var_suffix << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
          }
          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
          break;
        case CEED_EVAL_INTERP:
          // Copy slice q of the collocated values into the slice registers
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n";
          tab.push();
          code << tab << "r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n";
          tab.pop();
          code << tab << "}\n";
          break;
        case CEED_EVAL_GRAD:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          code << tab << "GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_q" << var_suffix << ", s_G"
               << var_suffix << ", r_s" << var_suffix << ");\n";
          break;
        case CEED_EVAL_WEIGHT:
          code << tab << "CeedScalar r_s" << var_suffix << "[1];\n";
          code << tab << "r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n";
          break;
        // LCOV_EXCL_START
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
          // LCOV_EXCL_STOP
      }
    }
    // Declare per-slice output registers (filled by the QFunction call below)
    code << "\n";
    code << tab << "// -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_out_" + std::to_string(i);

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      // Basis action
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          break;
        case CEED_EVAL_INTERP:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
          break;
        case CEED_EVAL_GRAD:
          code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          break;
        // LCOV_EXCL_START
        case CEED_EVAL_WEIGHT:
          break;  // Should not occur
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
          // LCOV_EXCL_STOP
      }
    }
  } else {
    // Full-element path: the QFunction reads/writes the r_q registers
    // directly, so the slice pointers simply alias them. The "{" opened here
    // is closed at the end of this function.
    code << "\n";
    code << tab << "// Note: Using full elements\n";
    code << tab << "{\n";
    tab.push();
    code << tab << "// -- Input fields\n";
    for (CeedInt i = 0; i < num_input_fields; i++) {
      const char *field_name;

      CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
      code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
      code << tab << "CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n";
    }
    code << tab << "// -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      code << tab << "CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n";
    }
  }

  // Input and output buffers
  // Pointer arrays passed to the user QFunction; sized at least 1 so the
  // generated declaration is valid even with zero fields
  code << "\n";
  code << tab << "// -- QFunction inputs and outputs\n";
  code << tab << "// ---- Inputs\n";
  code << tab << "CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
  for (CeedInt i = 0; i < num_input_fields; i++) {
    const char *field_name;

    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
    code << tab << "// ------ Input field " << i << ": " << field_name << "\n";
    code << tab << "inputs[" << i << "] = r_s_in_" << i << ";\n";
  }
  code << tab << "// ---- Outputs\n";
  code << tab << "CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
  for (CeedInt i = 0; i < num_output_fields; i++) {
    const char *field_name;

    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
    code << tab << "// ------ Output field " << i << ": " << field_name << "\n";
    code << tab << "outputs[" << i << "] = r_s_out_" << i << ";\n";
  }

  // Apply QFunction
  // Only the full-element 3D all-tensor path passes Q_1d points per call;
  // every other path processes one point/slice per call
  code << "\n";
  code << tab << "// -- Apply QFunction\n";
  code << tab << "" << qfunction_name << "(ctx, ";
  if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
    code << "1";
  } else {
    code << Q_name;
  }
  code << ", inputs, outputs);\n";

  if (is_at_points) {
    // Map back to coefficients
    for (CeedInt i = 0; i < 0; i++) {}  // NOTE(review): placeholder removed
    code << "\n";
    code << tab << "// -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_out_" + std::to_string(i);
      std::string P_name     = "P_1d" + var_suffix;

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      // Basis action
      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
      switch (eval_mode) {
        case CEED_EVAL_NONE: {
          CeedInt             comp_stride;
          CeedElemRestriction elem_rstr;

          CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
          CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
          code << tab << "WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
               << ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]"
               << ", r_s" << var_suffix << ", d" << var_suffix << ");\n";
          break;
        }
        case CEED_EVAL_INTERP:
          // Zero padding threads (beyond the element's point count) so they
          // do not pollute the coefficient accumulation
          code << tab << "if (i >= points.num_per_elem[elem]) {\n";
          tab.push();
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
          tab.pop();
          code << tab << "}\n";
          code << tab << "InterpTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
               << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
          break;
        case CEED_EVAL_GRAD:
          // Zero padding threads, as above
          code << tab << "if (i >= points.num_per_elem[elem]) {\n";
          tab.push();
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
          tab.pop();
          code << tab << "}\n";
          code << tab << "GradTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
               << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
          break;
        // LCOV_EXCL_START
        case CEED_EVAL_WEIGHT:
          break;  // Should not occur
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
          // LCOV_EXCL_STOP
      }
    }
  } else if (use_3d_slices) {
    // Copy or apply transpose grad, if needed
    code << "\n";
    code << tab << "// -- Output fields\n";
    for (CeedInt i = 0; i < num_output_fields; i++) {
      const char *field_name;
      std::string var_suffix = "_out_" + std::to_string(i);
      std::string P_name     = "P_1d" + var_suffix;

      CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
      code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      // Basis action
      code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
      switch (eval_mode) {
        case CEED_EVAL_NONE:
          // Scatter the slice result back into the collocated register layout
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
          tab.push();
          code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
          tab.pop();
          code << tab << "}\n";
          break;
        case CEED_EVAL_INTERP:
          code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
          tab.push();
          code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
          tab.pop();
          code << tab << "}\n";
          break;
        case CEED_EVAL_GRAD:
          code << tab << "GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_s" << var_suffix << ", s_G"
               << var_suffix << ", r_q" << var_suffix << ");\n";
          break;
        // LCOV_EXCL_START
        case CEED_EVAL_WEIGHT:
          break;  // Should not occur
        case CEED_EVAL_DIV:
        case CEED_EVAL_CURL:
          break;  // TODO: Not implemented
          // LCOV_EXCL_STOP
      }
    }
  }
  // Close the scope opened by whichever branch ran: the point loop, the slice
  // loop, or the full-element block
  tab.pop();
  code << tab << "}\n";
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Build single operator kernel
//------------------------------------------------------------------------------
extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_build) {
  bool                   is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
  Ceed                   ceed;
  CeedInt                Q = 0, Q_1d = 0, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
  CeedQFunctionField    *qf_input_fields, *qf_output_fields;
  CeedQFunction_Hip_gen *qf_data;
  CeedQFunction          qf;
  CeedOperatorField     *op_input_fields, *op_output_fields;
  CeedOperator_Hip_gen  *data;
  std::ostringstream     code;
  Tab                    tab;

  CeedCallBackend(CeedOperatorGetData(op, &data));
  {
    bool is_setup_done;

    CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
    if (is_setup_done) {
      *is_good_build = !data->use_fallback;
      return CEED_ERROR_SUCCESS;
    }
  }

  // Check field compatibility
  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
  {
    bool has_shared_bases = true;

    for (CeedInt i = 0; i < num_input_fields; i++) {
      CeedBasis basis;

      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
      if (basis !=
CEED_BASIS_NONE) { 1179 bool is_tensor = true; 1180 const char *resource; 1181 char *resource_root; 1182 Ceed basis_ceed; 1183 1184 CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 1185 is_all_tensor = is_all_tensor && is_tensor; 1186 is_all_nontensor = is_all_nontensor && !is_tensor; 1187 CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); 1188 CeedCallBackend(CeedGetResource(basis_ceed, &resource)); 1189 CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); 1190 has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared"); 1191 CeedCallBackend(CeedFree(&resource_root)); 1192 CeedCallBackend(CeedDestroy(&basis_ceed)); 1193 } 1194 CeedCallBackend(CeedBasisDestroy(&basis)); 1195 } 1196 1197 for (CeedInt i = 0; i < num_output_fields; i++) { 1198 CeedBasis basis; 1199 1200 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 1201 if (basis != CEED_BASIS_NONE) { 1202 bool is_tensor = true; 1203 const char *resource; 1204 char *resource_root; 1205 Ceed basis_ceed; 1206 1207 CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 1208 is_all_tensor = is_all_tensor && is_tensor; 1209 is_all_nontensor = is_all_nontensor && !is_tensor; 1210 1211 CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); 1212 CeedCallBackend(CeedGetResource(basis_ceed, &resource)); 1213 CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); 1214 has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared"); 1215 CeedCallBackend(CeedFree(&resource_root)); 1216 CeedCallBackend(CeedDestroy(&basis_ceed)); 1217 } 1218 CeedCallBackend(CeedBasisDestroy(&basis)); 1219 } 1220 // -- Fallback to ref if not all bases are shared 1221 if (!has_shared_bases) { 1222 *is_good_build = false; 1223 return CEED_ERROR_SUCCESS; 1224 } 1225 } 1226 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 1227 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 1228 CeedCallBackend(CeedQFunctionGetData(qf, 
&qf_data)); 1229 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 1230 1231 // Get operator data 1232 CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points)); 1233 { 1234 CeedInt max_P = 0, max_P_1d = 0; 1235 1236 CeedCallBackend(CeedOperatorBuildKernelData_Hip_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields, 1237 qf_output_fields, &max_P, &max_P_1d, &Q, &Q_1d, &max_dim, &is_all_tensor, &use_3d_slices)); 1238 data->max_P_1d = is_all_tensor ? max_P_1d : max_P; 1239 } 1240 if (is_at_points) { 1241 CeedInt coords_dim = 0; 1242 CeedElemRestriction_Hip *rstr_data; 1243 CeedElemRestriction rstr_points = NULL; 1244 1245 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); 1246 CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points)); 1247 CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride)); 1248 CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_points, &coords_dim)); 1249 CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data)); 1250 data->points.indices = (CeedInt *)rstr_data->d_offsets; 1251 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); 1252 if (max_dim == 0) max_dim = coords_dim; 1253 if (Q_1d == 0) max_num_points = ceil(pow(max_num_points, 1.0 / max_dim)); 1254 } 1255 if (max_dim == 0) max_dim = 1; 1256 data->dim = max_dim; 1257 if (is_at_points) use_3d_slices = false; 1258 if (Q_1d == 0) { 1259 if (is_at_points) Q_1d = max_num_points; 1260 else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d)); 1261 } 1262 if (Q == 0) Q = Q_1d; 1263 data->Q = Q; 1264 data->Q_1d = Q_1d; 1265 1266 // Check for restriction only identity operator 1267 { 1268 bool is_identity_qf; 1269 1270 CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf)); 1271 if (is_identity_qf) { 1272 CeedEvalMode eval_mode_in, eval_mode_out; 1273 1274 
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in)); 1275 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out)); 1276 CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND, 1277 "Backend does not implement restriction only identity operators"); 1278 } 1279 } 1280 1281 // Load basis source files 1282 if (!is_all_nontensor) { 1283 code << tab << "// Tensor basis source\n"; 1284 code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n"; 1285 } 1286 if (!is_all_tensor) { 1287 code << tab << "// Non-tensor basis source\n"; 1288 code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n"; 1289 } 1290 if (is_at_points) { 1291 code << tab << "// AtPoints basis source\n"; 1292 code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n"; 1293 } 1294 if (!is_all_tensor && !is_all_nontensor) { 1295 code << tab << "// Tensor basis source\n"; 1296 code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h>\n\n"; 1297 } 1298 code << tab << "// CodeGen operator source\n"; 1299 code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n"; 1300 1301 // Get QFunction name 1302 std::string qfunction_name(qf_data->qfunction_name); 1303 std::string operator_name; 1304 1305 operator_name = "CeedKernelHipGenOperator_" + qfunction_name; 1306 1307 // Define CEED_Q_VLA 1308 code << "\n" << tab << "#undef CEED_Q_VLA\n"; 1309 if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) { 1310 code << tab << "#define CEED_Q_VLA 1\n\n"; 1311 } else { 1312 code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n"; 1313 } 1314 1315 // Add user QFunction source 1316 { 1317 const char *source_path; 1318 1319 CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path)); 1320 CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend 
requires QFunction source code file"); 1321 1322 code << tab << "// User QFunction source\n"; 1323 code << tab << "#include \"" << source_path << "\"\n\n"; 1324 } 1325 1326 // Setup 1327 code << "\n" << tab << "// -----------------------------------------------------------------------------\n"; 1328 code << tab << "// Operator Kernel\n"; 1329 code << tab << "// \n"; 1330 code << tab << "// d_[in,out]_i: CeedVector device array\n"; 1331 code << tab << "// r_[in,out]_e_i: Element vector register\n"; 1332 code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n"; 1333 code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n"; 1334 code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n"; 1335 code << tab << "// \n"; 1336 code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n"; 1337 code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; 1338 code << tab << "// -----------------------------------------------------------------------------\n"; 1339 code << tab << "extern \"C\" __launch_bounds__(BLOCK_SIZE)\n"; 1340 code << "__global__ void " << operator_name 1341 << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar* W, Points_Hip points) {\n"; 1342 tab.push(); 1343 1344 // Scratch buffers 1345 for (CeedInt i = 0; i < num_input_fields; i++) { 1346 CeedEvalMode eval_mode; 1347 1348 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 1349 if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 1350 code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; 1351 } 1352 } 1353 for (CeedInt i = 0; i < num_output_fields; i++) { 1354 code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; 1355 } 1356 1357 code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; 1358 if (!is_all_tensor) { 1359 code << tab << "const 
CeedInt Q = " << Q << ";\n"; 1360 } 1361 if (!is_all_nontensor) { 1362 code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n"; 1363 } 1364 if (is_at_points) { 1365 code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n"; 1366 code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n"; 1367 } 1368 1369 // Shared data 1370 code << tab << "extern __shared__ CeedScalar slice[];\n"; 1371 code << tab << "SharedData_Hip data;\n"; 1372 code << tab << "data.t_id_x = threadIdx.x;\n"; 1373 code << tab << "data.t_id_y = threadIdx.y;\n"; 1374 code << tab << "data.t_id_z = threadIdx.z;\n"; 1375 code << tab << "data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 1376 code << tab << "data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n"; 1377 1378 // -- Determine input mat reuse 1379 FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX]; 1380 1381 for (CeedInt i = 0; i < num_input_fields; i++) { 1382 input_matrix_reuse[i].index = -1; 1383 } 1384 for (CeedInt i = 0; i < num_input_fields; i++) { 1385 bool is_tensor = true; 1386 CeedEvalMode eval_mode_i; 1387 CeedBasis basis_i; 1388 1389 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i)); 1390 if (eval_mode_i == CEED_EVAL_WEIGHT) continue; 1391 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i)); 1392 CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor)); 1393 for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) { 1394 CeedEvalMode eval_mode_j; 1395 CeedBasis basis_j; 1396 1397 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); 1398 if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 1399 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); 1400 if (basis_i == basis_j) { 1401 if (is_tensor) { 1402 input_matrix_reuse[i].index = j; 1403 input_matrix_reuse[i].is_input = true; 1404 
input_matrix_reuse[i].eval_mode = eval_mode_j; 1405 } else { 1406 // For non-tensor can only re-use with the same eval mode 1407 if (eval_mode_i == eval_mode_j) { 1408 input_matrix_reuse[i].index = j; 1409 input_matrix_reuse[i].is_input = true; 1410 input_matrix_reuse[i].eval_mode = eval_mode_j; 1411 } 1412 } 1413 } 1414 CeedCallBackend(CeedBasisDestroy(&basis_j)); 1415 } 1416 CeedCallBackend(CeedBasisDestroy(&basis_i)); 1417 } 1418 1419 // -- Determine output mat reuse 1420 FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX]; 1421 1422 for (CeedInt i = 0; i < num_output_fields; i++) { 1423 output_matrix_reuse[i].index = -1; 1424 } 1425 for (CeedInt i = 0; i < num_output_fields; i++) { 1426 bool is_tensor = true; 1427 CeedEvalMode eval_mode_i; 1428 CeedBasis basis_i; 1429 1430 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i)); 1431 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i)); 1432 for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) { 1433 CeedEvalMode eval_mode_j; 1434 CeedBasis basis_j; 1435 1436 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); 1437 if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 1438 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); 1439 if (basis_i == basis_j) { 1440 if (is_tensor) { 1441 output_matrix_reuse[i].index = j; 1442 output_matrix_reuse[i].is_input = true; 1443 output_matrix_reuse[i].eval_mode = eval_mode_j; 1444 } else { 1445 // For non-tensor can only re-use with the same eval mode 1446 if (eval_mode_i == eval_mode_j) { 1447 output_matrix_reuse[i].index = j; 1448 output_matrix_reuse[i].is_input = true; 1449 output_matrix_reuse[i].eval_mode = eval_mode_j; 1450 } 1451 } 1452 } 1453 CeedCallBackend(CeedBasisDestroy(&basis_j)); 1454 } 1455 for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) { 1456 CeedEvalMode eval_mode_j; 1457 CeedBasis basis_j; 1458 1459 
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j)); 1460 if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 1461 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j)); 1462 CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor)); 1463 if (basis_i == basis_j) { 1464 if (is_tensor) { 1465 output_matrix_reuse[i].index = j; 1466 output_matrix_reuse[i].is_input = false; 1467 output_matrix_reuse[i].eval_mode = eval_mode_j; 1468 } else { 1469 // For non-tensor can only re-use with the same eval mode 1470 if (eval_mode_i == eval_mode_j) { 1471 output_matrix_reuse[i].index = j; 1472 output_matrix_reuse[i].is_input = false; 1473 output_matrix_reuse[i].eval_mode = eval_mode_j; 1474 } 1475 } 1476 } 1477 CeedCallBackend(CeedBasisDestroy(&basis_j)); 1478 } 1479 CeedCallBackend(CeedBasisDestroy(&basis_i)); 1480 } 1481 1482 // Initialize constants, and matrices B and G 1483 code << "\n" << tab << "// Input field constants and basis data\n"; 1484 for (CeedInt i = 0; i < num_input_fields; i++) { 1485 CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], 1486 max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false)); 1487 } 1488 code << "\n" << tab << "// Output field constants and basis data\n"; 1489 for (CeedInt i = 0; i < num_output_fields; i++) { 1490 CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], 1491 max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false)); 1492 } 1493 1494 // Loop over all elements 1495 code << "\n" << tab << "// Element loop\n"; 1496 code << tab << "__syncthreads();\n"; 1497 code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n"; 1498 tab.push(); 1499 1500 // -- Compute minimum buffer space needed 1501 CeedInt 
max_rstr_buffer_size = 1; 1502 1503 for (CeedInt i = 0; i < num_input_fields; i++) { 1504 CeedEvalMode eval_mode; 1505 1506 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 1507 if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) { 1508 CeedInt num_comp; 1509 CeedElemRestriction elem_rstr; 1510 1511 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 1512 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 1513 max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); 1514 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 1515 } 1516 } 1517 for (CeedInt i = 0; i < num_output_fields; i++) { 1518 CeedEvalMode eval_mode; 1519 1520 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 1521 if (eval_mode != CEED_EVAL_NONE) { 1522 CeedInt num_comp; 1523 CeedElemRestriction elem_rstr; 1524 1525 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 1526 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 1527 max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? 
Q_1d : 1)); 1528 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 1529 } 1530 } 1531 code << tab << "// Scratch restriction buffer space\n"; 1532 code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n"; 1533 1534 // -- Determine best input field processing order 1535 CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX]; 1536 1537 for (CeedInt i = 0; i < num_input_fields; i++) { 1538 field_rstr_in_buffer[i] = -1; 1539 input_field_order[i] = -1; 1540 } 1541 { 1542 bool is_ordered[CEED_FIELD_MAX]; 1543 CeedInt curr_index = 0; 1544 1545 for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false; 1546 for (CeedInt i = 0; i < num_input_fields; i++) { 1547 CeedVector vec_i; 1548 CeedElemRestriction rstr_i; 1549 1550 if (is_ordered[i]) continue; 1551 field_rstr_in_buffer[i] = i; 1552 is_ordered[i] = true; 1553 input_field_order[curr_index] = i; 1554 curr_index++; 1555 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i)); 1556 if (vec_i == CEED_VECTOR_NONE) continue; // CEED_EVAL_WEIGHT 1557 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i)); 1558 for (CeedInt j = i + 1; j < num_input_fields; j++) { 1559 CeedVector vec_j; 1560 CeedElemRestriction rstr_j; 1561 1562 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j)); 1563 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j)); 1564 if (rstr_i == rstr_j && vec_i == vec_j) { 1565 field_rstr_in_buffer[j] = i; 1566 is_ordered[j] = true; 1567 input_field_order[curr_index] = j; 1568 curr_index++; 1569 } 1570 CeedCallBackend(CeedVectorDestroy(&vec_j)); 1571 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); 1572 } 1573 CeedCallBackend(CeedVectorDestroy(&vec_i)); 1574 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); 1575 } 1576 } 1577 1578 // -- Input restriction and basis 1579 code << "\n" << tab << "// -- Input field restrictions and basis actions\n"; 1580 for 
(CeedInt i = 0; i < num_input_fields; i++) { 1581 const char *field_name; 1582 const CeedInt f = input_field_order[i]; 1583 1584 CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name)); 1585 code << tab << "// ---- Input field " << f << ": " << field_name << "\n"; 1586 1587 // ---- Restriction 1588 CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], 1589 max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices)); 1590 1591 // ---- Basis action 1592 CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true, 1593 is_all_tensor, is_at_points, use_3d_slices)); 1594 } 1595 1596 // -- Q function 1597 CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields, 1598 qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name, 1599 Q_1d, is_all_tensor, is_at_points, use_3d_slices)); 1600 1601 // -- Output basis and restriction 1602 code << "\n" << tab << "// -- Output field basis action and restrictions\n"; 1603 for (CeedInt i = 0; i < num_output_fields; i++) { 1604 const char *field_name; 1605 1606 CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 1607 code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 1608 1609 // ---- Basis action 1610 CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false, 1611 is_all_tensor, is_at_points, use_3d_slices)); 1612 1613 // ---- Restriction 1614 CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, i, NULL, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, 1615 false, is_all_tensor, is_at_points, use_3d_slices)); 1616 } 1617 1618 // Close loop and function 1619 tab.pop(); 1620 code << tab << "}\n"; 
  tab.pop();
  code << tab << "}\n";
  code << tab << "// -----------------------------------------------------------------------------\n\n";

  CeedInt block_sizes[3] = {0, 0, 0};
  CeedInt num_elem;

  // Compile
  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
  CeedCallBackend(BlockGridCalculate_Hip_gen(is_all_tensor ? max_dim : 1, num_elem, data->max_P_1d, is_all_tensor ? Q_1d : Q, block_sizes));
  {
    bool is_compile_good = false;

    data->thread_1d = block_sizes[0];
    CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "OP_T_1D", block_sizes[0], "BLOCK_SIZE",
                                       block_sizes[0] * block_sizes[1] * block_sizes[2]));
    if (is_compile_good) {
      *is_good_build = true;
      CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, operator_name.c_str(), &data->op));
    } else {
      *is_good_build = false;
      data->use_fallback = true;
    }
  }
  CeedCallBackend(CeedOperatorSetSetupDone(op));
  CeedCallBackend(CeedDestroy(&ceed));
  CeedCallBackend(CeedQFunctionDestroy(&qf));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Build AtPoints assembly operator kernel
//
// Generates the HIP C++ source for a single-kernel operator assembly and JIT
// compiles it. Assembly proceeds column-by-column: for each node/component of
// the active input field, the active element vector is set to a unit vector,
// the operator is applied, and the resulting output element vector is written
// into values_array (full matrix rows when is_full, diagonal entries
// otherwise).
//
//   op            - CeedOperator to assemble; must be an AtPoints operator
//   is_full       - true builds the full-assembly kernel, false the
//                   diagonal-assembly kernel (selects module/kernel slots and
//                   the generated Write* call)
//   is_good_build - [out] true when JIT compilation succeeded; false when it
//                   failed, in which case data->use_assembly_fallback is set
//                   so a reference path can be used instead
//------------------------------------------------------------------------------
static int CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op, bool is_full, bool *is_good_build) {
  // Note: is_all_tensor stays true and use_3d_slices stays false throughout
  // this function; the AtPoints assembly path only emits tensor-basis code.
  bool is_all_tensor = true, is_at_points = false, use_3d_slices = false;
  Ceed ceed;
  CeedInt Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
  CeedQFunctionField *qf_input_fields, *qf_output_fields;
  CeedQFunction_Hip_gen *qf_data;
  CeedQFunction qf;
  CeedOperatorField *op_input_fields, *op_output_fields;
  CeedOperator_Hip_gen *data;
  std::ostringstream code;
  Tab tab;

  // Check compatibility
  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
  CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
  CeedCheck(is_at_points, ceed, CEED_ERROR_BACKEND, "Only AtPoints operator assembly supported");

  // Retrieve operator data
  CeedCallBackend(CeedOperatorGetData(op, &data));
  Q      = data->Q;
  Q_1d   = data->Q_1d;
  max_dim = data->dim;
  {
    // Point restriction gives the per-element point bound and the component
    // stride of the coordinate vector, both baked into the kernel as constants
    CeedElemRestriction rstr_points = NULL;

    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
    CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
    CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
    CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
  }
  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
  CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));

  // Load basis source files
  code << tab << "// Tensor basis source\n";
  code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
  code << tab << "// AtPoints basis source\n";
  code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n";
  code << tab << "// CodeGen operator source\n";
  code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";

  // Get QFunction name
  std::string qfunction_name(qf_data->qfunction_name);
  std::string operator_name;

  if (is_full) {
    operator_name = "CeedKernelHipGenOperatorFullAssembly_" + qfunction_name;
  } else {
    operator_name = "CeedKernelHipGenOperatorDiagonalAssembly_" + qfunction_name;
  }

  // Define CEED_Q_VLA
  code << "\n" << tab << "#undef CEED_Q_VLA\n";
  code << tab << "#define CEED_Q_VLA 1\n\n";

  // Add user QFunction source
  {
    const char *source_path;

    CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
    CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");

    code << tab << "// User QFunction source\n";
    code << tab << "#include \"" << source_path << "\"\n\n";
  }

  // Setup
  code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
  code << tab << "// Operator Assembly Kernel\n";
  code << tab << "// \n";
  code << tab << "// d_[in,out]_i: CeedVector device array\n";
  code << tab << "// r_[in,out]_e_i: Element vector register\n";
  code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
  code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
  code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
  code << tab << "// \n";
  code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
  code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
  code << tab << "// -----------------------------------------------------------------------------\n";
  code << tab << "extern \"C\" __global__ void " << operator_name
       << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar *W, Points_Hip "
          "points, CeedScalar *__restrict__ values_array) {\n";
  tab.push();

  // Scratch buffers
  for (CeedInt i = 0; i < num_input_fields; i++) {
    CeedEvalMode eval_mode;

    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
    if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
      code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
    }
  }
  for (CeedInt i = 0; i < num_output_fields; i++) {
    code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
  }

  // Host-side sizes are emitted as compile-time constants in the kernel
  code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
  code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
  code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n";
  code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";

  // Shared data
  // Each z-slice of the thread block gets its own OP_T_1D (1D) or
  // OP_T_1D*OP_T_1D (2D/3D) slice of the dynamic shared memory
  code << tab << "extern __shared__ CeedScalar slice[];\n";
  code << tab << "SharedData_Hip data;\n";
  code << tab << "data.t_id_x = threadIdx.x;\n";
  code << tab << "data.t_id_y = threadIdx.y;\n";
  code << tab << "data.t_id_z = threadIdx.z;\n";
  code << tab << "data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
  code << tab << "data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";

  // -- Determine input mat reuse
  // Inputs that share the same CeedBasis object as an earlier (non-WEIGHT)
  // input reuse its shared-memory B/G matrices instead of loading their own.
  // NOTE(review): unlike the main operator builder in this file, reuse here
  // does not restrict non-tensor bases to matching eval modes - presumably
  // safe because this AtPoints path only handles tensor bases; confirm.
  FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];

  for (CeedInt i = 0; i < num_input_fields; i++) {
    input_matrix_reuse[i].index = -1;
  }
  for (CeedInt i = 0; i < num_input_fields; i++) {
    CeedEvalMode eval_mode_i;
    CeedBasis basis_i;

    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
    if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
    CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
    for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
      CeedEvalMode eval_mode_j;
      CeedBasis basis_j;

      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
      if (basis_i == basis_j) {
        input_matrix_reuse[i].index     = j;
        input_matrix_reuse[i].is_input  = true;
        input_matrix_reuse[i].eval_mode = eval_mode_j;
      }
      CeedCallBackend(CeedBasisDestroy(&basis_j));
    }
    CeedCallBackend(CeedBasisDestroy(&basis_i));
  }

  // -- Determine output mat reuse
  // Outputs first try to reuse an input field's matrices, then an earlier
  // output's; is_input records which side the reused matrices came from
  FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];

  for (CeedInt i = 0; i < num_output_fields; i++) {
    output_matrix_reuse[i].index = -1;
  }
  for (CeedInt i = 0; i < num_output_fields; i++) {
    CeedEvalMode eval_mode_i;
    CeedBasis basis_i;

    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
    for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
      CeedEvalMode eval_mode_j;
      CeedBasis basis_j;

      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
      if (basis_i == basis_j) {
        output_matrix_reuse[i].index     = j;
        output_matrix_reuse[i].is_input  = true;
        output_matrix_reuse[i].eval_mode = eval_mode_j;
      }
      CeedCallBackend(CeedBasisDestroy(&basis_j));
    }
    for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
      CeedEvalMode eval_mode_j;
      CeedBasis basis_j;

      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
      CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
      if (basis_i == basis_j) {
        output_matrix_reuse[i].index     = j;
        output_matrix_reuse[i].is_input  = false;
        output_matrix_reuse[i].eval_mode = eval_mode_j;
      }
      CeedCallBackend(CeedBasisDestroy(&basis_j));
    }
    CeedCallBackend(CeedBasisDestroy(&basis_i));
  }

  // Initialize constants, and matrices B and G
  code << "\n" << tab << "// Input field constants and basis data\n";
  for (CeedInt i = 0; i < num_input_fields; i++) {
    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
                                                             max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false));
  }
  code << "\n" << tab << "// Output field constants and basis data\n";
  for (CeedInt i = 0; i < num_output_fields; i++) {
    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
                                                             max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false));
  }

  // Loop over all elements
  code << "\n" << tab << "// Element loop\n";
  code << tab << "__syncthreads();\n";
  code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
  tab.push();

  // -- Compute minimum buffer space needed
  // Largest per-thread restriction scratch across all non-NONE fields:
  // num_comp values per thread, times Q_1d for 3D tensor layouts
  CeedInt max_rstr_buffer_size = 1;

  for (CeedInt i = 0; i < num_input_fields; i++) {
    CeedEvalMode eval_mode;

    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
    if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
      CeedInt num_comp;
      CeedElemRestriction elem_rstr;

      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
      max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
    }
  }
  for (CeedInt i = 0; i < num_output_fields; i++) {
    CeedEvalMode eval_mode;

    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
    if (eval_mode != CEED_EVAL_NONE) {
      CeedInt num_comp;
      CeedElemRestriction elem_rstr;

      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
      max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
    }
  }
  code << tab << "// Scratch restriction buffer space\n";
  code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";

  // -- Determine best input field processing order
  // Fields backed by the same (vector, restriction) pair are grouped
  // consecutively so the restriction result in r_e_scratch can be shared;
  // field_rstr_in_buffer[j] names the field whose buffer field j reads
  CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];

  for (CeedInt i = 0; i < num_input_fields; i++) {
    field_rstr_in_buffer[i] = -1;
    input_field_order[i]    = -1;
  }
  {
    bool is_ordered[CEED_FIELD_MAX];
    CeedInt curr_index = 0;

    for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
    for (CeedInt i = 0; i < num_input_fields; i++) {
      CeedVector vec_i;
      CeedElemRestriction rstr_i;

      if (is_ordered[i]) continue;
      field_rstr_in_buffer[i] = i;
      is_ordered[i]           = true;
      input_field_order[curr_index] = i;
      curr_index++;
      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
      if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
      for (CeedInt j = i + 1; j < num_input_fields; j++) {
        CeedVector vec_j;
        CeedElemRestriction rstr_j;

        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
        if (rstr_i == rstr_j && vec_i == vec_j) {
          field_rstr_in_buffer[j] = i;
          is_ordered[j]           = true;
          input_field_order[curr_index] = j;
          curr_index++;
        }
        CeedCallBackend(CeedVectorDestroy(&vec_j));
        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
      }
      CeedCallBackend(CeedVectorDestroy(&vec_i));
      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
    }
  }

  // -- Input restriction and basis
  // Passive inputs get their restriction and basis action emitted once, up
  // front. Active inputs instead get a zeroed element-vector register that
  // will be set one entry at a time in the assembly loop below; any active
  // input after the first aliases the first one's register.
  code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
  CeedInt active_field_index = -1;

  for (CeedInt i = 0; i < num_input_fields; i++) {
    bool is_active = false;
    const char *field_name;
    const CeedInt f = input_field_order[i];

    {
      CeedVector vec;

      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
      is_active = vec == CEED_VECTOR_ACTIVE;
      CeedCallBackend(CeedVectorDestroy(&vec));
    }

    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
    code << tab << "// ---- Input field " << f << ": " << field_name << "\n";

    if (is_active) {
      std::string var_suffix = "_in_" + std::to_string(f);

      code << tab << "// Active field - no restriction or basis action here\n";
      if (active_field_index == -1) {
        active_field_index = f;
        code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? "P_1d" + var_suffix : "1")
             << "] = {0.0};\n";
      } else {
        code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_in_" << active_field_index << ";\n";
      }
    } else {
      // ---- Restriction
      CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
                                                                 max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));

      // ---- Basis action
      CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
                                                           is_all_tensor, is_at_points, use_3d_slices));
    }
  }

  // -- Loop over active field
  // NOTE(review): assumes at least one active input exists; otherwise
  // active_field_index stays -1 and the emitted suffix "_in_-1" would not
  // compile - presumably guaranteed by the caller, confirm upstream.
  std::string active_var_suffix = "_in_" + std::to_string(active_field_index);

  code << "\n" << tab << "// Loop over nodes in active field\n";
  code << tab << "for (CeedInt n = 0; n < num_comp" << active_var_suffix << "*P_1d" << active_var_suffix
       << (max_dim > 1 ? "*P_1d" + active_var_suffix : "") << (max_dim > 2 ? "*P_1d" + active_var_suffix : "") << "; n++) {\n";
  tab.push();

  // -- Set current active node and component to 1
  code << tab << "// Set current active node and component to 1.0\n";
  code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 1.0, r_e"
       << active_var_suffix << ");\n\n";

  // Re-apply the basis action of every active input for this unit vector
  for (CeedInt i = 0; i < num_input_fields; i++) {
    bool is_active = false;
    const char *field_name;
    const CeedInt f = input_field_order[i];

    {
      CeedVector vec;

      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
      is_active = vec == CEED_VECTOR_ACTIVE;
      CeedCallBackend(CeedVectorDestroy(&vec));
    }
    if (!is_active) continue;

    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
    code << tab << "// ---- Input field " << f << ": " << field_name << "\n";

    // ---- Basis action
    CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
                                                         is_all_tensor, is_at_points, use_3d_slices));
  }

  // -- Q function
  CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
                                                           qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
                                                           Q_1d, is_all_tensor, is_at_points, use_3d_slices));

  // -- Output basis and restriction
  // Only active outputs contribute assembled values; the write targets
  // values_array directly (full rows vs single diagonal entries)
  code << "\n" << tab << "// -- Output field basis action and restrictions\n";
  for (CeedInt i = 0; i < num_output_fields; i++) {
    bool is_active = false;
    const char *field_name;

    {
      CeedVector vec;

      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
      is_active = vec == CEED_VECTOR_ACTIVE;
      CeedCallBackend(CeedVectorDestroy(&vec));
    }
    if (!is_active) continue;

    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
    code << tab << "// ---- Output field " << i << ": " << field_name << "\n";

    // ---- Basis action
    CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
                                                         is_all_tensor, is_at_points, use_3d_slices));

    // ---- Restriction
    if (is_full) {
      std::string var_suffix = "_out_" + std::to_string(i);
      CeedInt comp_stride;
      CeedSize l_size;
      CeedElemRestriction elem_rstr;

      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
      CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
      code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
      CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
      code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
      code << tab << "WriteLVecStandard" << max_dim << "d_Assembly<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
           << ">(data, l_size" << var_suffix << ", elem, n, r_e" << var_suffix << ", values_array);\n";
      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
    } else {
      std::string var_suffix = "_out_" + std::to_string(i);
      CeedInt comp_stride;
      CeedSize l_size;
      CeedElemRestriction elem_rstr;

      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
      CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
      code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
      CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
      code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
      code << tab << "WriteLVecStandard" << max_dim << "d_Single<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
           << ">(data, l_size" << var_suffix << ", elem, n, indices.outputs[" << i << "], r_e" << var_suffix << ", values_array);\n";
      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
    }
  }

  // -- Reset current active node and component
  code << "\n" << tab << "// Reset current active node and component to 0.0\n";
  code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 0.0, r_e"
       << active_var_suffix << ");\n";

  // -- End of loop over active field
  tab.pop();
  code << tab << "}\n";

  // Close loop and function
  tab.pop();
  code << tab << "}\n";
  tab.pop();
  code << tab << "}\n";
  code << tab << "// -----------------------------------------------------------------------------\n\n";

  CeedInt block_sizes[3] = {0, 0, 0};
  CeedInt num_elem;

  // Compile
  // Try to JIT the generated source; on failure flag the assembly fallback
  // instead of erroring so a reference path can take over
  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
  CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes));
  {
    bool is_compile_good = false;

    data->thread_1d = block_sizes[0];
    CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good,
                                       is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 2, "OP_T_1D", block_sizes[0],
                                       "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]));
    if (is_compile_good) {
      *is_good_build = true;
      CeedCallBackend(CeedGetKernel_Hip(ceed, is_full ? data->module_assemble_full : data->module_assemble_diagonal, operator_name.c_str(),
                                        is_full ? &data->assemble_full : &data->assemble_diagonal));
    } else {
      *is_good_build = false;
      data->use_assembly_fallback = true;
    }
  }
  CeedCallBackend(CeedDestroy(&ceed));
  CeedCallBackend(CeedQFunctionDestroy(&qf));
  return CEED_ERROR_SUCCESS;
}

// Thin wrapper: build the diagonal-only AtPoints assembly kernel
extern "C" int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) {
  return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, false, is_good_build);
}

// Thin wrapper: build the full-matrix AtPoints assembly kernel
extern "C" int CeedOperatorBuildKernelFullAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) {
  return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, true, is_good_build);
}
//------------------------------------------------------------------------------
// Build QFunction assembly operator kernel
//------------------------------------------------------------------------------
extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperator op, bool *is_good_build) {
  bool is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
  Ceed ceed;
  CeedInt Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0;
  CeedQFunctionField *qf_input_fields, *qf_output_fields;
  CeedQFunction_Hip_gen *qf_data;
  CeedQFunction qf;
  CeedOperatorField *op_input_fields, *op_output_fields;
  CeedOperator_Hip_gen *data;
  std::ostringstream code;
  Tab tab;

  // Check compatibility
  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
  CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
  CeedCheck(!is_at_points, ceed, CEED_ERROR_BACKEND, "AtPoints QFunction assembly is not supported");

  // Check field compatibility
  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
  {
    bool has_shared_bases = true;

    for (CeedInt i =
0; i < num_input_fields; i++) { 2148 CeedBasis basis; 2149 2150 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 2151 if (basis != CEED_BASIS_NONE) { 2152 bool is_tensor = true; 2153 const char *resource; 2154 char *resource_root; 2155 Ceed basis_ceed; 2156 2157 CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 2158 is_all_tensor = is_all_tensor && is_tensor; 2159 is_all_nontensor = is_all_nontensor && !is_tensor; 2160 CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); 2161 CeedCallBackend(CeedGetResource(basis_ceed, &resource)); 2162 CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); 2163 has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared"); 2164 CeedCallBackend(CeedFree(&resource_root)); 2165 CeedCallBackend(CeedDestroy(&basis_ceed)); 2166 } 2167 CeedCallBackend(CeedBasisDestroy(&basis)); 2168 } 2169 2170 for (CeedInt i = 0; i < num_output_fields; i++) { 2171 CeedBasis basis; 2172 2173 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 2174 if (basis != CEED_BASIS_NONE) { 2175 bool is_tensor = true; 2176 const char *resource; 2177 char *resource_root; 2178 Ceed basis_ceed; 2179 2180 CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 2181 is_all_tensor = is_all_tensor && is_tensor; 2182 is_all_nontensor = is_all_nontensor && !is_tensor; 2183 2184 CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); 2185 CeedCallBackend(CeedGetResource(basis_ceed, &resource)); 2186 CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); 2187 has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared"); 2188 CeedCallBackend(CeedFree(&resource_root)); 2189 CeedCallBackend(CeedDestroy(&basis_ceed)); 2190 } 2191 CeedCallBackend(CeedBasisDestroy(&basis)); 2192 } 2193 } 2194 2195 // Retrieve operator data 2196 CeedCallBackend(CeedOperatorGetData(op, &data)); 2197 Q = data->Q; 2198 Q_1d = data->Q_1d; 2199 max_dim = data->dim; 
2200 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 2201 CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); 2202 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 2203 2204 // Load basis source files 2205 if (!is_all_nontensor) { 2206 code << tab << "// Tensor basis source\n"; 2207 code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n"; 2208 } 2209 if (!is_all_tensor) { 2210 code << tab << "// Non-tensor basis source\n"; 2211 code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n"; 2212 } 2213 if (!is_all_tensor && !is_all_nontensor) { 2214 code << "// Tensor basis source\n"; 2215 code << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h>\n\n"; 2216 } 2217 code << "// CodeGen operator source\n"; 2218 code << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n"; 2219 2220 // Get QFunction name 2221 std::string qfunction_name(qf_data->qfunction_name); 2222 std::string operator_name; 2223 2224 operator_name = "CeedKernelHipGenQFunctionAssembly_" + qfunction_name; 2225 2226 // Define CEED_Q_VLA 2227 code << "\n" << tab << "#undef CEED_Q_VLA\n"; 2228 if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) { 2229 code << tab << "#define CEED_Q_VLA 1\n\n"; 2230 } else { 2231 code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n"; 2232 } 2233 2234 // Add user QFunction source 2235 { 2236 const char *source_path; 2237 2238 CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path)); 2239 CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file"); 2240 2241 code << tab << "// User QFunction source\n"; 2242 code << tab << "#include \"" << source_path << "\"\n\n"; 2243 } 2244 2245 // Setup 2246 code << "\n" << tab << "// -----------------------------------------------------------------------------\n"; 2247 code << tab << "// Operator Assembly Kernel\n"; 2248 
code << tab << "// \n"; 2249 code << tab << "// d_[in,out]_i: CeedVector device array\n"; 2250 code << tab << "// r_[in,out]_e_i: Element vector register\n"; 2251 code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n"; 2252 code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n"; 2253 code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n"; 2254 code << tab << "// \n"; 2255 code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n"; 2256 code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; 2257 code << tab << "// -----------------------------------------------------------------------------\n"; 2258 code << tab << "extern \"C\" __global__ void " << operator_name 2259 << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar *W, Points_Hip " 2260 "points, CeedScalar *__restrict__ values_array) {\n"; 2261 tab.push(); 2262 2263 // Scratch buffers 2264 for (CeedInt i = 0; i < num_input_fields; i++) { 2265 CeedEvalMode eval_mode; 2266 2267 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 2268 if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 2269 code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; 2270 } 2271 } 2272 for (CeedInt i = 0; i < num_output_fields; i++) { 2273 code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; 2274 } 2275 2276 code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; 2277 if (!is_all_tensor) { 2278 code << tab << "const CeedInt Q = " << Q << ";\n"; 2279 } 2280 if (!is_all_nontensor) { 2281 code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n"; 2282 } 2283 2284 // Shared data 2285 code << tab << "extern __shared__ CeedScalar slice[];\n"; 2286 code << tab << "SharedData_Hip data;\n"; 2287 code << tab << "data.t_id_x = threadIdx.x;\n"; 2288 code << tab << 
"data.t_id_y = threadIdx.y;\n"; 2289 code << tab << "data.t_id_z = threadIdx.z;\n"; 2290 code << tab << "data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 2291 code << tab << "data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n"; 2292 2293 // -- Determine input mat reuse 2294 FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX]; 2295 2296 for (CeedInt i = 0; i < num_input_fields; i++) { 2297 input_matrix_reuse[i].index = -1; 2298 } 2299 for (CeedInt i = 0; i < num_input_fields; i++) { 2300 bool is_tensor = true; 2301 CeedEvalMode eval_mode_i; 2302 CeedBasis basis_i; 2303 2304 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i)); 2305 if (eval_mode_i == CEED_EVAL_WEIGHT) continue; 2306 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i)); 2307 CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor)); 2308 for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) { 2309 CeedEvalMode eval_mode_j; 2310 CeedBasis basis_j; 2311 2312 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); 2313 if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 2314 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); 2315 if (basis_i == basis_j) { 2316 if (is_tensor) { 2317 input_matrix_reuse[i].index = j; 2318 input_matrix_reuse[i].is_input = true; 2319 input_matrix_reuse[i].eval_mode = eval_mode_j; 2320 } else { 2321 // For non-tensor can only re-use with the same eval mode 2322 if (eval_mode_i == eval_mode_j) { 2323 input_matrix_reuse[i].index = j; 2324 input_matrix_reuse[i].is_input = true; 2325 input_matrix_reuse[i].eval_mode = eval_mode_j; 2326 } 2327 } 2328 } 2329 CeedCallBackend(CeedBasisDestroy(&basis_j)); 2330 } 2331 CeedCallBackend(CeedBasisDestroy(&basis_i)); 2332 } 2333 2334 // -- Determine output mat reuse 2335 FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX]; 2336 2337 for 
(CeedInt i = 0; i < num_output_fields; i++) { 2338 output_matrix_reuse[i].index = -1; 2339 } 2340 for (CeedInt i = 0; i < num_output_fields; i++) { 2341 bool is_tensor = true; 2342 CeedEvalMode eval_mode_i; 2343 CeedBasis basis_i; 2344 2345 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i)); 2346 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i)); 2347 CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor)); 2348 for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) { 2349 CeedEvalMode eval_mode_j; 2350 CeedBasis basis_j; 2351 2352 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); 2353 if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 2354 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); 2355 if (basis_i == basis_j) { 2356 if (is_tensor) { 2357 output_matrix_reuse[i].index = j; 2358 output_matrix_reuse[i].is_input = true; 2359 output_matrix_reuse[i].eval_mode = eval_mode_j; 2360 } else { 2361 // For non-tensor can only re-use with the same eval mode 2362 if (eval_mode_i == eval_mode_j) { 2363 output_matrix_reuse[i].index = j; 2364 output_matrix_reuse[i].is_input = true; 2365 output_matrix_reuse[i].eval_mode = eval_mode_j; 2366 } 2367 } 2368 } 2369 CeedCallBackend(CeedBasisDestroy(&basis_j)); 2370 } 2371 for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) { 2372 CeedEvalMode eval_mode_j; 2373 CeedBasis basis_j; 2374 2375 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j)); 2376 if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 2377 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j)); 2378 if (basis_i == basis_j) { 2379 if (is_tensor) { 2380 output_matrix_reuse[i].index = j; 2381 output_matrix_reuse[i].is_input = false; 2382 output_matrix_reuse[i].eval_mode = eval_mode_j; 2383 } else { 2384 // For non-tensor can only re-use with the same eval mode 2385 if 
(eval_mode_i == eval_mode_j) { 2386 output_matrix_reuse[i].index = j; 2387 output_matrix_reuse[i].is_input = false; 2388 output_matrix_reuse[i].eval_mode = eval_mode_j; 2389 } 2390 } 2391 } 2392 CeedCallBackend(CeedBasisDestroy(&basis_j)); 2393 } 2394 CeedCallBackend(CeedBasisDestroy(&basis_i)); 2395 } 2396 2397 // Initialize constants, and matrices B and G 2398 code << "\n" << tab << "// Input field constants and basis data\n"; 2399 for (CeedInt i = 0; i < num_input_fields; i++) { 2400 CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], 2401 max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, true)); 2402 } 2403 code << "\n" << tab << "// Output field constants and basis data\n"; 2404 for (CeedInt i = 0; i < num_output_fields; i++) { 2405 CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], 2406 max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, true)); 2407 } 2408 2409 // Loop over all elements 2410 code << "\n" << tab << "// Element loop\n"; 2411 code << tab << "__syncthreads();\n"; 2412 code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n"; 2413 tab.push(); 2414 2415 // -- Compute minimum buffer space needed 2416 CeedInt max_rstr_buffer_size = 1; 2417 2418 for (CeedInt i = 0; i < num_input_fields; i++) { 2419 CeedEvalMode eval_mode; 2420 2421 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 2422 if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) { 2423 CeedInt num_comp; 2424 CeedElemRestriction elem_rstr; 2425 2426 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 2427 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 2428 max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, 
num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); 2429 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 2430 } 2431 } 2432 for (CeedInt i = 0; i < num_output_fields; i++) { 2433 CeedEvalMode eval_mode; 2434 2435 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 2436 if (eval_mode != CEED_EVAL_NONE) { 2437 CeedInt num_comp; 2438 CeedElemRestriction elem_rstr; 2439 2440 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 2441 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 2442 max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); 2443 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 2444 } 2445 } 2446 code << tab << "// Scratch restriction buffer space\n"; 2447 code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n"; 2448 2449 // -- Determine best input field processing order 2450 CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX]; 2451 2452 for (CeedInt i = 0; i < num_input_fields; i++) { 2453 field_rstr_in_buffer[i] = -1; 2454 input_field_order[i] = -1; 2455 } 2456 { 2457 bool is_ordered[CEED_FIELD_MAX]; 2458 CeedInt curr_index = 0; 2459 2460 for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false; 2461 for (CeedInt i = 0; i < num_input_fields; i++) { 2462 CeedVector vec_i; 2463 CeedElemRestriction rstr_i; 2464 2465 if (is_ordered[i]) continue; 2466 field_rstr_in_buffer[i] = i; 2467 is_ordered[i] = true; 2468 input_field_order[curr_index] = i; 2469 curr_index++; 2470 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i)); 2471 if (vec_i == CEED_VECTOR_NONE) continue; // CEED_EVAL_WEIGHT 2472 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i)); 2473 for (CeedInt j = i + 1; j < num_input_fields; j++) { 2474 CeedVector vec_j; 2475 CeedElemRestriction rstr_j; 2476 2477 
CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j)); 2478 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j)); 2479 if (rstr_i == rstr_j && vec_i == vec_j) { 2480 field_rstr_in_buffer[j] = i; 2481 is_ordered[j] = true; 2482 input_field_order[curr_index] = j; 2483 curr_index++; 2484 } 2485 CeedCallBackend(CeedVectorDestroy(&vec_j)); 2486 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); 2487 } 2488 CeedCallBackend(CeedVectorDestroy(&vec_i)); 2489 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); 2490 } 2491 } 2492 2493 // -- Input restriction and basis 2494 code << "\n" << tab << "// -- Input field restrictions and basis actions\n"; 2495 CeedInt num_active_in = 0, num_active_out = 0, qf_assembly_size_out = 0; 2496 CeedInt active_fields_in[CEED_FIELD_MAX], active_fields_out[CEED_FIELD_MAX]; 2497 2498 for (CeedInt i = 0; i < num_input_fields; i++) { 2499 bool is_active = false; 2500 const char *field_name; 2501 const CeedInt f = input_field_order[i]; 2502 2503 { 2504 CeedVector vec; 2505 2506 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec)); 2507 is_active = vec == CEED_VECTOR_ACTIVE; 2508 CeedCallBackend(CeedVectorDestroy(&vec)); 2509 } 2510 2511 CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name)); 2512 code << tab << "// ---- Input field " << f << ": " << field_name << "\n"; 2513 2514 if (is_active) { 2515 CeedEvalMode eval_mode; 2516 CeedInt field_size; 2517 2518 active_fields_in[num_active_in] = f; 2519 num_active_in++; 2520 CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[f], &field_size)); 2521 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[f], &eval_mode)); 2522 if (eval_mode == CEED_EVAL_GRAD) { 2523 code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << "dim_in_" << f << "*" 2524 << (is_all_tensor && (max_dim >= 3) ? 
"Q_1d" : "1") << "] = {0.};\n"; 2525 } else { 2526 code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n"; 2527 } 2528 code << tab << "const CeedInt field_size_in_" << f << " = " << field_size << ";\n"; 2529 } else { 2530 // ---- Restriction 2531 CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], 2532 max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices)); 2533 2534 // ---- Basis action 2535 CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true, 2536 is_all_tensor, is_at_points, use_3d_slices)); 2537 } 2538 } 2539 code << tab << "const CeedInt field_sizes_in[" << num_active_in << "] = {"; 2540 for (CeedInt i = 0; i < num_active_in; i++) { 2541 code << "field_size_in_" << active_fields_in[i] << (i < num_active_in - 1 ? ", " : ""); 2542 } 2543 code << "};\n"; 2544 code << tab << "CeedScalar * r_q_in[" << num_active_in << "] = {"; 2545 for (CeedInt i = 0; i < num_active_in; i++) { 2546 code << "r_q_in_" << active_fields_in[i] << (i < num_active_in - 1 ? 
", " : ""); 2547 } 2548 code << "};\n"; 2549 2550 for (CeedInt i = 0; i < num_output_fields; i++) { 2551 bool is_active = false; 2552 2553 { 2554 CeedVector vec; 2555 2556 CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); 2557 is_active = vec == CEED_VECTOR_ACTIVE; 2558 CeedCallBackend(CeedVectorDestroy(&vec)); 2559 } 2560 if (is_active) { 2561 const char *field_name; 2562 CeedInt field_size; 2563 2564 active_fields_out[num_active_out] = i; 2565 num_active_out++; 2566 CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size)); 2567 qf_assembly_size_out += field_size; 2568 CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 2569 code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 2570 code << tab << "const CeedInt field_size_out_" << i << " = " << field_size << ";\n"; 2571 } 2572 } 2573 code << tab << "const CeedInt field_sizes_out[" << num_active_out << "] = {"; 2574 for (CeedInt i = 0; i < num_active_out; i++) { 2575 code << "field_size_out_" << active_fields_out[i] << (i < num_active_out - 1 ? 
", " : ""); 2576 } 2577 code << "};\n"; 2578 code << tab << "const CeedInt total_size_out = " << qf_assembly_size_out << ";\n"; 2579 2580 // -- Loop over active field 2581 code << "\n" << tab << "CeedInt input_offset = 0;\n"; 2582 code << tab << "// Loop over active QFunction input fields\n"; 2583 code << tab << "const CeedInt num_active_in = " << num_active_in << ";\n"; 2584 code << tab << "for (CeedInt a = 0; a < num_active_in; a++) {\n"; 2585 tab.push(); 2586 2587 // -- Loop over size of active field 2588 code << "\n" << tab << "// Loop over current active input field size\n"; 2589 code << tab << "const CeedInt field_size_in = field_sizes_in[a];\n"; 2590 code << tab << "for (CeedInt s = 0; s < field_size_in; s++) {\n"; 2591 tab.push(); 2592 2593 // -- Set current active point and component to 1 2594 code << tab << "// Set current active point and component to 1.0\n"; 2595 if (is_all_tensor && (max_dim >= 3)) { 2596 code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 1.0;\n"; 2597 } else { 2598 code << tab << "r_q_in[a][s] = 1.0;\n"; 2599 } 2600 2601 // -- Q function 2602 CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields, 2603 qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name, 2604 Q_1d, is_all_tensor, is_at_points, use_3d_slices)); 2605 2606 // -- Output basis and restriction 2607 code << "\n" << tab << "// -- Output field basis action and restrictions\n"; 2608 CeedScalar offset = 0; 2609 2610 for (CeedInt i = 0; i < num_output_fields; i++) { 2611 bool is_active = false; 2612 const char *field_name; 2613 2614 { 2615 CeedVector vec; 2616 2617 CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); 2618 is_active = vec == CEED_VECTOR_ACTIVE; 2619 CeedCallBackend(CeedVectorDestroy(&vec)); 2620 } 2621 if (!is_active) continue; 2622 2623 CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], 
&field_name)); 2624 code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 2625 2626 // ---- Restriction 2627 CeedInt field_size; 2628 2629 code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d_QFAssembly<total_size_out, field_size_out_" << i << ", " 2630 << (is_all_tensor ? "Q_1d" : "Q") << ">(data, num_elem, elem, input_offset + s, " << offset << ", r_q_out_" << i << ", values_array);\n"; 2631 CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size)); 2632 offset += field_size; 2633 } 2634 2635 // -- Reset current active node and component 2636 code << "\n" << tab << "// Reset current active node and component to 0.0\n"; 2637 if (is_all_tensor && (max_dim >= 3)) { 2638 code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 0.0;\n"; 2639 } else { 2640 code << tab << "r_q_in[a][s] = 0.0;\n"; 2641 } 2642 2643 // -- End of loop over size of active field 2644 tab.pop(); 2645 code << tab << "}\n"; 2646 code << tab << "input_offset += field_size_in;\n"; 2647 2648 // -- End of loop over active field 2649 tab.pop(); 2650 code << tab << "}\n"; 2651 2652 // Close loop and function 2653 tab.pop(); 2654 code << tab << "}\n"; 2655 tab.pop(); 2656 code << tab << "}\n"; 2657 code << tab << "// -----------------------------------------------------------------------------\n\n"; 2658 2659 CeedInt block_sizes[3] = {0, 0, 0}; 2660 CeedInt num_elem; 2661 2662 // Compile 2663 CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); 2664 CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes)); 2665 { 2666 bool is_compile_good = false; 2667 2668 data->thread_1d = block_sizes[0]; 2669 CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 2, "OP_T_1D", block_sizes[0], 2670 "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2])); 2671 if (is_compile_good) { 2672 *is_good_build = true; 2673 
CeedCallBackend(CeedGetKernel_Hip(ceed, data->module_assemble_qfunction, operator_name.c_str(), &data->assemble_qfunction)); 2674 } else { 2675 *is_good_build = false; 2676 data->use_assembly_fallback = true; 2677 } 2678 } 2679 CeedCallBackend(CeedDestroy(&ceed)); 2680 CeedCallBackend(CeedQFunctionDestroy(&qf)); 2681 return CEED_ERROR_SUCCESS; 2682 } 2683 2684 //------------------------------------------------------------------------------ 2685