// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

#define CEED_DEBUG_COLOR 12

#include <ceed.h>
#include <ceed/backend.h>
#include <ceed/gen-tools.h>
#include <ceed/jit-tools.h>

#include <iostream>
#include <sstream>
#include <string>

#include "../hip-ref/ceed-hip-ref.h"
#include "../hip-shared/ceed-hip-shared.h"
#include "../hip/ceed-hip-common.h"
#include "../hip/ceed-hip-compile.h"
#include "ceed-hip-gen.h"

// Description of a previously-processed operator field whose shared-memory
// basis matrices (s_B_*/s_G_*) may be reused by a later field instead of
// loading a second copy into shared memory.
struct FieldReuse_Hip {
  CeedInt      index;      // index of the field to reuse; -1 means no reuse
  bool         is_input;   // true if the reused field is an input field
  CeedEvalMode eval_mode;  // eval mode the reused field's matrices were loaded for
};

//------------------------------------------------------------------------------
// Calculate the block size used for launching the operator kernel
//
// Thread block shape is driven by thread_1d = max(P_1d, Q_1d):
//   dim == 1: (thread_1d, 1, elems_per_block)
//   dim == 2: (thread_1d, thread_1d, elems_per_block)
//   dim == 3: (thread_1d, thread_1d, elems_per_block)
// The x/y/z block dimensions are written to block_sizes[0..2].
// NOTE(review): num_elem is not used in the visible computation.
//------------------------------------------------------------------------------
extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt num_elem, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes) {
  const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
  if (dim == 1) {
    // Batch elements into the z dimension, capping the block at 256 threads
    CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;

    // Guard against thread_1d > 256 producing a zero batch size
    elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
    block_sizes[0]  = thread_1d;
    block_sizes[1]  = 1;
    block_sizes[2]  = elems_per_block;
  } else if (dim == 2) {
    const CeedInt elems_per_block = thread_1d < 4 ? 16 : 2;

    block_sizes[0] = thread_1d;
    block_sizes[1] = thread_1d;
    block_sizes[2] = elems_per_block;
  } else if (dim == 3) {
    const CeedInt elems_per_block = thread_1d < 6 ? 4 : (thread_1d < 8 ? 2 : 1);

    block_sizes[0] = thread_1d;
    block_sizes[1] = thread_1d;
    block_sizes[2] = elems_per_block;
  }
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Determine type of operator
//
// Scans all input and output fields to compute:
//   max_P, max_P_1d - largest basis node counts (total and 1D, tensor only)
//   Q, Q_1d         - quadrature sizes; all fields with a basis must agree
//   max_dim         - max basis dimension, folded into *max_dim
//                     (assumes caller initialized *max_dim — TODO confirm)
//   is_all_tensor   - true when every field with a basis uses a tensor basis
//   use_3d_slices   - true only when all bases are 3D tensor and every
//                     CEED_EVAL_GRAD field has a collocated gradient matrix,
//                     enabling the 3D-slice parallelization strategy
//------------------------------------------------------------------------------
static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
                                               CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields,
                                               CeedQFunctionField *qf_output_fields, CeedInt *max_P, CeedInt *max_P_1d, CeedInt *Q, CeedInt *Q_1d,
                                               CeedInt *max_dim, bool *is_all_tensor, bool *use_3d_slices) {
  // Check if all are tensor
  *is_all_tensor = true;
  for (CeedInt i = 0; i < num_input_fields; i++) {
    CeedBasis basis;

    CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
    if (basis != CEED_BASIS_NONE) {
      bool is_field_tensor;

      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
      *is_all_tensor = *is_all_tensor && is_field_tensor;
    }
    CeedCallBackend(CeedBasisDestroy(&basis));
  }
  for (CeedInt i = 0; i < num_output_fields; i++) {
    CeedBasis basis;

    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
    if (basis != CEED_BASIS_NONE) {
      bool is_field_tensor;

      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
      *is_all_tensor = *is_all_tensor && is_field_tensor;
    }
    CeedCallBackend(CeedBasisDestroy(&basis));
  }

  // Find max_P, max_P_1d, Q, and Q_1d
  bool is_all_3d = true;

  *max_P    = 0;
  *max_P_1d = 0;
  *Q        = 0;
  *Q_1d     = 0;
  for (CeedInt i = 0; i < num_input_fields; i++) {
    CeedBasis basis;

    CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
    if (basis != CEED_BASIS_NONE) {
      bool    is_field_tensor;
      CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;

      // Check if 3D
      CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
      is_all_3d = is_all_3d && (field_dim == 3);
      *max_dim  = CeedIntMax(*max_dim, field_dim);

      // Collect P, P_1d, Q, and Q_1d
      CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
      *max_P = CeedIntMax(*max_P, field_P);
      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
      if (is_field_tensor) {
        CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
        *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
      }
      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
      CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
      *Q = field_Q;
      if (is_field_tensor) {
        CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
        CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
        *Q_1d = field_Q_1d;
      }
    }
    CeedCallBackend(CeedBasisDestroy(&basis));
  }
  for (CeedInt i = 0; i < num_output_fields; i++) {
    CeedBasis basis;

    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
    if (basis != CEED_BASIS_NONE) {
      bool    is_field_tensor;
      CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;

      // Check if 3D
      CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
      is_all_3d = is_all_3d && (field_dim == 3);
      *max_dim  = CeedIntMax(*max_dim, field_dim);

      // Collect P, P_1d, Q, and Q_1d
      CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
      *max_P = CeedIntMax(*max_P, field_P);
      CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
      if (is_field_tensor) {
        CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
        *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
      }
      CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
      CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
      *Q = field_Q;
      if (is_field_tensor) {
        CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
        CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
        *Q_1d = field_Q_1d;
      }
    }
    CeedCallBackend(CeedBasisDestroy(&basis));
  }

  // Only use 3D collocated gradient parallelization strategy when gradient is computed
  *use_3d_slices = false;
  if (is_all_3d && *is_all_tensor) {
    bool was_grad_found = false;

    for (CeedInt i = 0; i < num_input_fields; i++) {
      CeedEvalMode eval_mode;

      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
      if (eval_mode == CEED_EVAL_GRAD) {
        CeedBasis_Hip_shared *basis_data;
        CeedBasis             basis;

        CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
        CeedCallBackend(CeedBasisGetData(basis, &basis_data));
        // All GRAD fields must provide a collocated gradient for slices to apply
        *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
        was_grad_found = true;
        CeedCallBackend(CeedBasisDestroy(&basis));
      }
    }
    for (CeedInt i = 0; i < num_output_fields; i++) {
      CeedEvalMode eval_mode;

      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
      if (eval_mode == CEED_EVAL_GRAD) {
        CeedBasis_Hip_shared *basis_data;
        CeedBasis             basis;

        CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
        CeedCallBackend(CeedBasisGetData(basis, &basis_data));
        *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
        was_grad_found = true;
        CeedCallBackend(CeedBasisDestroy(&basis));
      }
    }
  }
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Setup fields
//
// Emits per-field constants (dim, P, Q, num_comp) into the generated kernel
// source in `code` and stages the field's basis matrices (interp/grad) for
// device use, loading them into shared memory or reusing a prior field's copy.
//------------------------------------------------------------------------------
static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i,
                                                    CeedOperatorField op_field, CeedQFunctionField qf_field, FieldReuse_Hip field_reuse,
                                                    CeedInt max_dim, CeedInt Q, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
                                                    bool use_3d_slices, bool skip_active_load) {
  bool      is_tensor = true, is_active = true;
  CeedBasis basis;

  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
  if (basis != CEED_BASIS_NONE) CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
  {
    CeedVector vec;

    CeedCallBackend(CeedOperatorFieldGetVector(op_field, &vec));
    is_active = vec == CEED_VECTOR_ACTIVE;
    CeedCallBackend(CeedVectorDestroy(&vec));
  }

  const char           *field_name;
  std::string           var_suffix  = (is_input ? "_in_" : "_out_") + std::to_string(i);
  std::string           P_name      = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
  std::string           option_name = (is_input ? "inputs" : "outputs");
  CeedEvalMode          eval_mode   = CEED_EVAL_NONE;
  CeedInt               elem_size = 0, num_comp = 0, dim = max_dim, P_1d = 0;
  CeedElemRestriction   elem_rstr;
  CeedBasis_Hip_shared *basis_data;

  // Field reuse info
  bool use_previous_field = field_reuse.index != -1;

  CeedCallBackend(CeedOperatorFieldGetName(op_field, &field_name));
  code << tab << "// -- " << (is_input ?
                                         "Input" : "Output") << " field " << i << ": " << field_name << "\n";

  // Get field data
  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
  }
  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
  if (basis != CEED_BASIS_NONE) {
    CeedCallBackend(CeedBasisGetData(basis, &basis_data));
    CeedCallBackend(CeedBasisGetDimension(basis, &dim));
    if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
    else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
  }
  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));

  // Set field constants
  code << tab << "const CeedInt dim" << var_suffix << " = " << dim << ";\n";
  if (is_tensor && !is_all_tensor) {
    // Mixed tensor/non-tensor operator: also emit the total node count P
    CeedInt P = 0;

    CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
    code << tab << "const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q : P) << ";\n";
  }
  code << tab << "const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
  if (eval_mode != CEED_EVAL_WEIGHT) {
    code << tab << "const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
  }

  // Load basis data
  code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
  switch (eval_mode) {
    case CEED_EVAL_NONE:
      break;
    case CEED_EVAL_INTERP:
      if (is_at_points) {
        // AtPoints: lazily build and cache the Chebyshev interpolation matrix on device
        if (!basis_data->d_chebyshev_interp_1d) {
          CeedSize    interp_bytes;
          CeedScalar *chebyshev_interp_1d;

          interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
          CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
          CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
          CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
          CeedCallHip(CeedBasisReturnCeed(basis),
                      hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
          CeedCallBackend(CeedFree(&chebyshev_interp_1d));
        }
        if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
        else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
      } else {
        // Standard quadrature
        if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
        else data->B.outputs[i] = basis_data->d_interp_1d;
      }
      if (use_previous_field && !skip_active_load) {
        // Alias a previously-loaded shared-memory interp matrix
        std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

        code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
      } else {
        bool is_collocated = false;

        CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
        if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) {
          // Collocated (or deferred active) interp needs no matrix in shared memory
          code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
        } else {
          code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
          code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
        }
      }
      break;
    case CEED_EVAL_GRAD:
      if (is_at_points) {
        // AtPoints: lazily build and cache the Chebyshev interpolation matrix on device
        if (!basis_data->d_chebyshev_interp_1d) {
          CeedSize    interp_bytes;
          CeedScalar *chebyshev_interp_1d;

          interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
          CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
          CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
          CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
          CeedCallHip(CeedBasisReturnCeed(basis),
                      hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
          CeedCallBackend(CeedFree(&chebyshev_interp_1d));
        }
        if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
        else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
      } else {
        // Standard quadrature
        if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
        else data->B.outputs[i] = basis_data->d_interp_1d;
      }
      if (is_tensor) {
        // Tensor gradient also needs the interp matrix s_B
        if (use_previous_field && !skip_active_load) {
          std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

          code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
        } else {
          bool is_collocated = false;

          CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
          if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) {
            code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
          } else {
            code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
            code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
          }
        }
      }
      if (is_at_points) break;  // No G mat for AtPoints
      if (use_3d_slices) {
        // 3D slices: collocated gradient, Q_1d x Q_1d matrix
        if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
        else data->G.outputs[i] = basis_data->d_collo_grad_1d;
        if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
          std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

          code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
        } else if (is_active && skip_active_load) {
          code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
        } else {
          code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
          code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
        }
      } else {
        bool has_collo_grad = basis_data->d_collo_grad_1d;

        // Prefer the collocated gradient matrix when the basis provides one
        if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
        else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
        if (has_collo_grad) {
          // Collocated gradient: Q_1d x Q_1d matrix
          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
            std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

            code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
          } else if (is_active && skip_active_load) {
            code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
          } else {
            code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
            code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
          }
        } else {
          // Full gradient: P x Q matrix (times dim for non-tensor bases)
          if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
            std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));

            code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
          } else if (is_active && skip_active_load) {
            code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
          } else {
            code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim")
                 << (is_tensor ? "" : var_suffix) << "];\n";
            code << tab << "LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << (is_tensor ? "" : var_suffix) << ">(data, G."
                 << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
          }
        }
      }
      break;
    case CEED_EVAL_WEIGHT:
      break;  // No action
    // LCOV_EXCL_START
    case CEED_EVAL_DIV:
    case CEED_EVAL_CURL:
      break;  // TODO: Not implemented
    // LCOV_EXCL_STOP
  }
  CeedCallBackend(CeedBasisDestroy(&basis));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Restriction
//
// Emits the element-restriction read (inputs) or write (outputs) for field i
// into the generated kernel source and records device offset arrays in `data`.
//------------------------------------------------------------------------------
static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i,
                                                      CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
                                                      CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
                                                      bool use_3d_slices) {
  std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
  std::string P_name     = (is_all_tensor ?
                            "P_1d" : "P") + var_suffix;
  CeedEvalMode             eval_mode = CEED_EVAL_NONE;
  CeedInt                  elem_size = 0, num_comp = 0;
  CeedSize                 l_size;
  CeedRestrictionType      rstr_type = CEED_RESTRICTION_STANDARD;
  CeedElemRestriction_Hip *rstr_data;
  CeedElemRestriction      elem_rstr;

  // Get field data
  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
    CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
    CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
  }
  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));

  // Restriction
  if (is_input) {
    // Input
    if (field_input_buffer[i] != i) {
      std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);

      // Restriction was already done for previous input
      code << tab << "CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
    } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) {
      if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) {
        // No basis action, so r_e_in_* is also r_q_in_* and needs to be allocated
        code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
      } else if (rstr_type != CEED_RESTRICTION_POINTS) {
        // Otherwise we're using the scratch space
        code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
      }
      switch (rstr_type) {
        case CEED_RESTRICTION_STANDARD: {
          CeedInt comp_stride;

          CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
          code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
          data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
          code << tab << "ReadLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
               << P_name << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix
               << ");\n";
          break;
        }
        case CEED_RESTRICTION_STRIDED: {
          bool    has_backend_strides;
          CeedInt num_elem;

          CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
          CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
          // Default backend stride layout; overwritten below when user strides are set
          CeedInt strides[3] = {1, elem_size * num_elem, elem_size};

          if (!has_backend_strides) {
            CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
          }
          code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
               << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
          code << tab << "ReadLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
               << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, d" << var_suffix << ", r_e"
               << var_suffix << ");\n";
          break;
        }
        case CEED_RESTRICTION_POINTS: {
          CeedInt comp_stride;

          CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
          code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
          data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
          break;
        }
        // LCOV_EXCL_START
        case CEED_RESTRICTION_ORIENTED:
        case CEED_RESTRICTION_CURL_ORIENTED:
          break;  // TODO: Not implemented
        // LCOV_EXCL_STOP
      }
    }
  } else {
    // Output
    switch (rstr_type) {
      case CEED_RESTRICTION_STANDARD: {
        CeedInt comp_stride;

        CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
        code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
        CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
        code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
        data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
        code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
             << P_name << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix
             << ");\n";
        break;
      }
      case CEED_RESTRICTION_STRIDED: {
        bool    has_backend_strides;
        CeedInt num_elem;

        CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
        CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
        // Default backend stride layout; overwritten below when user strides are set
        CeedInt strides[3] = {1, elem_size * num_elem, elem_size};

        if (!has_backend_strides) {
          CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
        }
        code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
             << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
        code << tab << "WriteLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
             << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, r_e" << var_suffix << ", d" << var_suffix
             << ");\n";
        break;
      }
      case CEED_RESTRICTION_POINTS:
        data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
        break;
      // LCOV_EXCL_START
      case CEED_RESTRICTION_ORIENTED:
      case CEED_RESTRICTION_CURL_ORIENTED:
        break;  // TODO: Not implemented
      // LCOV_EXCL_STOP
    }
  }
  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Basis
//
// Emits the basis action (interp/grad/weight, forward for inputs and
// transpose for outputs) for field i into the generated kernel source.
//------------------------------------------------------------------------------
static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i, CeedOperatorField op_field,
                                                CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d, bool is_input,
                                                bool is_all_tensor,
                                                bool is_at_points, bool use_3d_slices) {
  bool      is_tensor = true, is_collocated = true;
  CeedBasis basis;
  CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
  CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
  CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));

  std::string         var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
  std::string         P_name     = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
  CeedEvalMode        eval_mode  = CEED_EVAL_NONE;
  CeedInt             dim = max_dim, elem_size = 0, num_comp = 0, P_1d = 0;
  CeedElemRestriction elem_rstr;

  // Get field data
  CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
  if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
    CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
    CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
  }
  CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
  if (basis != CEED_BASIS_NONE) {
    CeedCallBackend(CeedBasisGetDimension(basis, &dim));
    if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
    else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
  }
  CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));

  // Basis
  code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
  if (is_input) {
    switch (eval_mode) {
      case CEED_EVAL_NONE:
        // No basis action; alias the E-vector registers as Q-point values
        if (!use_3d_slices && !is_at_points) {
          code << tab << "CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n";
        }
        break;
      case CEED_EVAL_INTERP:
        if (is_at_points) {
          // AtPoints: interpolate to Chebyshev coefficients r_c
          std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";

          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
               << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
        } else {
          std::string function_name = is_tensor ? ((dim == 1 ? "Interp" : "InterpTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
                                                   std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
                                                : "InterpNonTensor";
          std::string op_t_1d_name  = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);

          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
               << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
        }
        break;
      case CEED_EVAL_GRAD:
        if (is_at_points) {
          // AtPoints: interpolate to Chebyshev coefficients r_c; gradient applied at points later
          std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";

          code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
               << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
        } else if (use_3d_slices) {
          // 3D slices: interpolate only; collocated gradient is applied per-slice
          std::string function_name =
              (dim > 1 ? "InterpTensor" : "Interp") + std::string(is_collocated ? "CollocatedNodes" : "") + std::to_string(dim) + "d";

          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
               << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
        } else if (is_tensor) {
          bool        is_collocated_grad = dim == 3 && Q_1d >= P_1d;
          std::string function_name =
              (dim == 1 ? "Grad" : ("GradTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) +
              std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
          std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);

          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
               << (is_all_tensor && dim >= 3 ? Q_name : "1") << "];\n";
          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
               << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
        } else {
          std::string function_name = "GradNonTensor";

          code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
          code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
               << ", OP_T_1D>(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
        }
        break;
      case CEED_EVAL_WEIGHT: {
        if (is_at_points) {
          code << tab << "// Nothing to do AtPoints\n";
        } else {
          CeedBasis_Hip_shared *basis_data;
          std::string           function_name = is_tensor
                                          ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
                                          : "WeightNonTensor";

          code << tab << "CeedScalar r_q" << var_suffix << "[" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
          CeedCallBackend(CeedBasisGetData(basis, &basis_data));
          // Stash the device quadrature weights for the kernel launch
          data->W = basis_data->d_q_weight_1d;
          code << tab << function_name << "<" << P_name << ", " << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
        }
        break;
      }
      // LCOV_EXCL_START
      case CEED_EVAL_DIV:
      case CEED_EVAL_CURL:
        break;  // TODO: Not implemented
      // LCOV_EXCL_STOP
    }
  } else {
    switch (eval_mode) {
      case CEED_EVAL_NONE:
        code << tab << "CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
        break;  // No action
      case CEED_EVAL_INTERP:
        code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
        if (is_at_points) {
          std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";

          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
               << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
        } else {
          std::string function_name =
              is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
                           std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
                        : "InterpTransposeNonTensor";
          std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);

          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
               << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
        }
        break;
      case CEED_EVAL_GRAD:
        code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
        if (is_at_points) {
          std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";

          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
               << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
        } else if (use_3d_slices) {
          // 3D slices: gradient was accumulated per-slice; only transpose interp remains
          std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
                                      std::to_string(dim) + "d";

          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
               << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
        } else if (is_tensor) {
          bool        is_collocated_grad = dim == 3 && Q_1d >= P_1d;
          std::string function_name =
              (dim == 1 ? "GradTranspose"
                        : ("GradTransposeTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) +
              std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
          std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);

          code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
               << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
        } else {
          std::string function_name = "GradTransposeNonTensor";

          code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
               << ", OP_T_1D>(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
        }
        break;
      // LCOV_EXCL_START
      case CEED_EVAL_WEIGHT:
        break;  // Should not occur
      case CEED_EVAL_DIV:
      case CEED_EVAL_CURL:
        break;  // TODO: Not implemented
      // LCOV_EXCL_STOP
    }
  }
  CeedCallBackend(CeedBasisDestroy(&basis));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// QFunction
//
// Emits the per-quadrature-point QFunction call site, including output array
// setup, into the generated kernel source.
//------------------------------------------------------------------------------
static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt max_dim,
                                                    CeedInt max_num_points, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
                                                    CeedQFunctionField *qf_input_fields, CeedInt num_output_fields,
                                                    CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields,
                                                    std::string qfunction_name, CeedInt Q_1d, bool is_all_tensor, bool is_at_points,
                                                    bool use_3d_slices) {
  std::string Q_name = is_all_tensor ?
"Q_1d" : "Q"; 734 CeedEvalMode eval_mode = CEED_EVAL_NONE; 735 CeedElemRestriction elem_rstr; 736 737 // Setup output arrays 738 code << "\n"; 739 code << tab << "// -- Output field setup\n"; 740 for (CeedInt i = 0; i < num_output_fields; i++) { 741 const char *field_name; 742 std::string var_suffix = "_out_" + std::to_string(i); 743 744 CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 745 code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 746 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 747 switch (eval_mode) { 748 case CEED_EVAL_NONE: 749 if (is_at_points) { 750 code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n"; 751 } else { 752 code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") 753 << "];\n"; 754 } 755 break; 756 case CEED_EVAL_INTERP: 757 if (is_at_points) { 758 // Accumulator for point data 759 code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n"; 760 code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix 761 << "[i] = 0.0;\n"; 762 } else { 763 code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") 764 << "];\n"; 765 } 766 break; 767 case CEED_EVAL_GRAD: 768 if (is_at_points) { 769 // Accumulator for point data 770 code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n"; 771 code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? 
Q_name : "1") << "; i++) r_c" << var_suffix 772 << "[i] = 0.0;\n"; 773 } else if (use_3d_slices) { 774 // Accumulator for gradient slices 775 code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n"; 776 code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) r_q" << var_suffix << "[i] = 0.0;\n"; 777 } else { 778 code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*" 779 << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") << "];\n"; 780 } 781 break; 782 case CEED_EVAL_WEIGHT: 783 break; 784 // LCOV_EXCL_START 785 case CEED_EVAL_DIV: 786 case CEED_EVAL_CURL: 787 break; // TODO: Not implemented 788 // LCOV_EXCL_STOP 789 } 790 } 791 792 if (is_at_points) { 793 // We need to handle batches of points 794 code << "\n"; 795 code << tab << "// Note: Using batches of points\n"; 796 code << tab << "const CeedInt point_loop_bound = (blockDim.x*blockDim.y) * ceil((1.0*max_num_points) / (blockDim.x*blockDim.y));\n\n"; 797 code << tab << "#pragma unroll\n"; 798 code << tab << "for (CeedInt i = threadIdx.x + threadIdx.y*blockDim.x; i < point_loop_bound; i += blockDim.x*blockDim.y) {\n"; 799 tab.push(); 800 code << tab << "const CeedInt p = i % max_num_points;\n\n"; 801 802 code << tab << "// -- Coordinates\n"; 803 code << tab << "CeedScalar r_x[max_dim];\n"; 804 code << tab << "ReadPoint<max_dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n"; 805 806 code << tab << "// -- Input fields\n"; 807 for (CeedInt i = 0; i < num_input_fields; i++) { 808 const char *field_name; 809 std::string var_suffix = "_in_" + std::to_string(i); 810 std::string P_name = "P_1d" + var_suffix; 811 812 CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name)); 813 code << tab << "// ---- Input field " << i << ": " << field_name << "\n"; 814 
CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 815 // Basis action 816 code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 817 switch (eval_mode) { 818 case CEED_EVAL_NONE: 819 code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 820 code << tab << "ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix 821 << ", max_num_points>(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n"; 822 break; 823 case CEED_EVAL_INTERP: 824 code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 825 code << tab << "InterpAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name 826 << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n"; 827 break; 828 case CEED_EVAL_GRAD: 829 code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n"; 830 code << tab << "GradAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name 831 << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n"; 832 break; 833 case CEED_EVAL_WEIGHT: 834 code << tab << "CeedScalar r_s" << var_suffix << "[1];\n"; 835 code << tab << "r_s" << var_suffix << "[0] = 1.0;\n"; 836 break; 837 // LCOV_EXCL_START 838 case CEED_EVAL_DIV: 839 case CEED_EVAL_CURL: 840 break; // TODO: Not implemented 841 // LCOV_EXCL_STOP 842 } 843 } 844 code << "\n"; 845 code << tab << "// -- Output fields\n"; 846 for (CeedInt i = 0; i < num_output_fields; i++) { 847 const char *field_name; 848 std::string var_suffix = "_out_" + std::to_string(i); 849 850 CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 851 code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 852 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 853 // 
Basis action 854 switch (eval_mode) { 855 case CEED_EVAL_NONE: 856 code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 857 break; 858 case CEED_EVAL_INTERP: 859 code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 860 break; 861 case CEED_EVAL_GRAD: 862 code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n"; 863 break; 864 // LCOV_EXCL_START 865 case CEED_EVAL_WEIGHT: 866 break; // Should not occur 867 case CEED_EVAL_DIV: 868 case CEED_EVAL_CURL: 869 break; // TODO: Not implemented 870 // LCOV_EXCL_STOP 871 } 872 } 873 874 } else if (use_3d_slices) { 875 // We treat quadrature points per slice in 3d to save registers 876 code << "\n"; 877 code << tab << "// Note: Using planes of 3D elements\n"; 878 code << tab << "#pragma unroll\n"; 879 code << tab << "for (CeedInt q = 0; q < " << Q_name << "; q++) {\n"; 880 tab.push(); 881 code << tab << "// -- Input fields\n"; 882 for (CeedInt i = 0; i < num_input_fields; i++) { 883 const char *field_name; 884 std::string var_suffix = "_in_" + std::to_string(i); 885 886 CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name)); 887 code << tab << "// ---- Input field " << i << ": " << field_name << "\n"; 888 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 889 // Basis action 890 code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 891 switch (eval_mode) { 892 case CEED_EVAL_NONE: 893 bool is_strided; 894 895 code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 896 897 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 898 CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); 899 if (is_strided) { 900 bool has_backend_strides; 901 CeedInt num_elem, elem_size; 902 903 CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 904 
CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); 905 CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); 906 CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 907 908 if (!has_backend_strides) { 909 CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); 910 } 911 code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1] 912 << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n"; 913 code << tab << "ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", strides" << var_suffix << "_0, strides" 914 << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n"; 915 } else { 916 CeedSize l_size = 0; 917 CeedInt comp_stride; 918 CeedElemRestriction_Hip *rstr_data; 919 920 CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 921 code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; 922 CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 923 code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 924 CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); 925 data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; 926 code << tab << "ReadEVecSliceStandard3d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", " << Q_name << ">(data, l_size" 927 << var_suffix << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n"; 928 } 929 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 930 break; 931 case CEED_EVAL_INTERP: 932 code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 933 code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n"; 934 tab.push(); 935 code << tab << "r_s" << var_suffix << "[j] = r_q" << var_suffix 
<< "[q + j*" << Q_name << "];\n"; 936 tab.pop(); 937 code << tab << "}\n"; 938 break; 939 case CEED_EVAL_GRAD: 940 code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n"; 941 code << tab << "GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_q" << var_suffix << ", s_G" 942 << var_suffix << ", r_s" << var_suffix << ");\n"; 943 break; 944 case CEED_EVAL_WEIGHT: 945 code << tab << "CeedScalar r_s" << var_suffix << "[1];\n"; 946 code << tab << "r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n"; 947 break; 948 // LCOV_EXCL_START 949 case CEED_EVAL_DIV: 950 case CEED_EVAL_CURL: 951 break; // TODO: Not implemented 952 // LCOV_EXCL_STOP 953 } 954 } 955 code << "\n"; 956 code << tab << "// -- Output fields\n"; 957 for (CeedInt i = 0; i < num_output_fields; i++) { 958 const char *field_name; 959 std::string var_suffix = "_out_" + std::to_string(i); 960 961 CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 962 code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 963 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 964 // Basis action 965 switch (eval_mode) { 966 case CEED_EVAL_NONE: 967 code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 968 break; 969 case CEED_EVAL_INTERP: 970 code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 971 break; 972 case CEED_EVAL_GRAD: 973 code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n"; 974 break; 975 // LCOV_EXCL_START 976 case CEED_EVAL_WEIGHT: 977 break; // Should not occur 978 case CEED_EVAL_DIV: 979 case CEED_EVAL_CURL: 980 break; // TODO: Not implemented 981 // LCOV_EXCL_STOP 982 } 983 } 984 } else { 985 code << "\n"; 986 code << tab << "// Note: Using full elements\n"; 987 code << tab << "{\n"; 988 tab.push(); 989 code << tab << "// -- 
Input fields\n"; 990 for (CeedInt i = 0; i < num_input_fields; i++) { 991 const char *field_name; 992 993 CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name)); 994 code << tab << "// ---- Input field " << i << ": " << field_name << "\n"; 995 code << tab << "CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n"; 996 } 997 code << tab << "// -- Output fields\n"; 998 for (CeedInt i = 0; i < num_output_fields; i++) { 999 const char *field_name; 1000 1001 CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 1002 code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 1003 code << tab << "CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n"; 1004 } 1005 } 1006 1007 // Input and output buffers 1008 code << "\n"; 1009 code << tab << "// -- QFunction inputs and outputs\n"; 1010 code << tab << "// ---- Inputs\n"; 1011 code << tab << "CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n"; 1012 for (CeedInt i = 0; i < num_input_fields; i++) { 1013 const char *field_name; 1014 1015 CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name)); 1016 code << tab << "// ------ Input field " << i << ": " << field_name << "\n"; 1017 code << tab << "inputs[" << i << "] = r_s_in_" << i << ";\n"; 1018 } 1019 code << tab << "// ---- Outputs\n"; 1020 code << tab << "CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n"; 1021 for (CeedInt i = 0; i < num_output_fields; i++) { 1022 const char *field_name; 1023 1024 CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 1025 code << tab << "// ------ Output field " << i << ": " << field_name << "\n"; 1026 code << tab << "outputs[" << i << "] = r_s_out_" << i << ";\n"; 1027 } 1028 1029 // Apply QFunction 1030 code << "\n"; 1031 code << tab << "// -- Apply QFunction\n"; 1032 code << tab << "" << qfunction_name << "(ctx, "; 1033 if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) { 1034 code << 
"1"; 1035 } else { 1036 code << Q_name; 1037 } 1038 code << ", inputs, outputs);\n"; 1039 1040 if (is_at_points) { 1041 // Map back to coefficients 1042 code << "\n"; 1043 code << tab << "// -- Output fields\n"; 1044 for (CeedInt i = 0; i < num_output_fields; i++) { 1045 const char *field_name; 1046 std::string var_suffix = "_out_" + std::to_string(i); 1047 std::string P_name = "P_1d" + var_suffix; 1048 1049 CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 1050 code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 1051 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 1052 // Basis action 1053 code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 1054 switch (eval_mode) { 1055 case CEED_EVAL_NONE: { 1056 CeedInt comp_stride; 1057 CeedElemRestriction elem_rstr; 1058 1059 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 1060 CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 1061 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 1062 code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 1063 code << tab << "WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix 1064 << ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]" 1065 << ", r_s" << var_suffix << ", d" << var_suffix << ");\n"; 1066 break; 1067 } 1068 case CEED_EVAL_INTERP: 1069 code << tab << "if (i >= points.num_per_elem[elem]) {\n"; 1070 tab.push(); 1071 code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n"; 1072 tab.pop(); 1073 code << tab << "}\n"; 1074 code << tab << "InterpTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name 1075 << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n"; 1076 break; 1077 case CEED_EVAL_GRAD: 
1078 code << tab << "if (i >= points.num_per_elem[elem]) {\n"; 1079 tab.push(); 1080 code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n"; 1081 tab.pop(); 1082 code << tab << "}\n"; 1083 code << tab << "GradTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name 1084 << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n"; 1085 break; 1086 // LCOV_EXCL_START 1087 case CEED_EVAL_WEIGHT: 1088 break; // Should not occur 1089 case CEED_EVAL_DIV: 1090 case CEED_EVAL_CURL: 1091 break; // TODO: Not implemented 1092 // LCOV_EXCL_STOP 1093 } 1094 } 1095 } else if (use_3d_slices) { 1096 // Copy or apply transpose grad, if needed 1097 code << "\n"; 1098 code << tab << "// -- Output fields\n"; 1099 for (CeedInt i = 0; i < num_output_fields; i++) { 1100 const char *field_name; 1101 std::string var_suffix = "_out_" + std::to_string(i); 1102 std::string P_name = "P_1d" + var_suffix; 1103 1104 CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 1105 code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 1106 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 1107 // Basis action 1108 code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 1109 switch (eval_mode) { 1110 case CEED_EVAL_NONE: 1111 code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n"; 1112 tab.push(); 1113 code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n"; 1114 tab.pop(); 1115 code << tab << "}\n"; 1116 break; 1117 case CEED_EVAL_INTERP: 1118 code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n"; 1119 tab.push(); 1120 code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n"; 1121 tab.pop(); 1122 code << tab << "}\n"; 1123 break; 1124 case 
CEED_EVAL_GRAD: 1125 code << tab << "GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_s" << var_suffix << ", s_G" 1126 << var_suffix << ", r_q" << var_suffix << ");\n"; 1127 break; 1128 // LCOV_EXCL_START 1129 case CEED_EVAL_WEIGHT: 1130 break; // Should not occur 1131 case CEED_EVAL_DIV: 1132 case CEED_EVAL_CURL: 1133 break; // TODO: Not implemented 1134 // LCOV_EXCL_STOP 1135 } 1136 } 1137 } 1138 tab.pop(); 1139 code << tab << "}\n"; 1140 return CEED_ERROR_SUCCESS; 1141 } 1142 1143 //------------------------------------------------------------------------------ 1144 // Build single operator kernel 1145 //------------------------------------------------------------------------------ 1146 extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_build) { 1147 bool is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false; 1148 Ceed ceed; 1149 CeedInt Q = 0, Q_1d = 0, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0; 1150 CeedQFunctionField *qf_input_fields, *qf_output_fields; 1151 CeedQFunction_Hip_gen *qf_data; 1152 CeedQFunction qf; 1153 CeedOperatorField *op_input_fields, *op_output_fields; 1154 CeedOperator_Hip_gen *data; 1155 std::ostringstream code; 1156 Tab tab; 1157 1158 CeedCallBackend(CeedOperatorGetData(op, &data)); 1159 { 1160 bool is_setup_done; 1161 1162 CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); 1163 if (is_setup_done) { 1164 *is_good_build = !data->use_fallback; 1165 return CEED_ERROR_SUCCESS; 1166 } 1167 } 1168 1169 // Check field compatibility 1170 CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 1171 { 1172 bool has_shared_bases = true; 1173 1174 for (CeedInt i = 0; i < num_input_fields; i++) { 1175 CeedBasis basis; 1176 1177 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 1178 if (basis != 
CEED_BASIS_NONE) { 1179 bool is_tensor = true; 1180 const char *resource; 1181 char *resource_root; 1182 Ceed basis_ceed; 1183 1184 CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 1185 is_all_tensor = is_all_tensor && is_tensor; 1186 is_all_nontensor = is_all_nontensor && !is_tensor; 1187 CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); 1188 CeedCallBackend(CeedGetResource(basis_ceed, &resource)); 1189 CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); 1190 has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared"); 1191 CeedCallBackend(CeedFree(&resource_root)); 1192 CeedCallBackend(CeedDestroy(&basis_ceed)); 1193 } 1194 CeedCallBackend(CeedBasisDestroy(&basis)); 1195 } 1196 1197 for (CeedInt i = 0; i < num_output_fields; i++) { 1198 CeedBasis basis; 1199 1200 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 1201 if (basis != CEED_BASIS_NONE) { 1202 bool is_tensor = true; 1203 const char *resource; 1204 char *resource_root; 1205 Ceed basis_ceed; 1206 1207 CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 1208 is_all_tensor = is_all_tensor && is_tensor; 1209 is_all_nontensor = is_all_nontensor && !is_tensor; 1210 1211 CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); 1212 CeedCallBackend(CeedGetResource(basis_ceed, &resource)); 1213 CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); 1214 has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared"); 1215 CeedCallBackend(CeedFree(&resource_root)); 1216 CeedCallBackend(CeedDestroy(&basis_ceed)); 1217 } 1218 CeedCallBackend(CeedBasisDestroy(&basis)); 1219 } 1220 // -- Fallback to ref if not all bases are shared 1221 if (!has_shared_bases) { 1222 *is_good_build = false; 1223 return CEED_ERROR_SUCCESS; 1224 } 1225 } 1226 CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 1227 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 1228 CeedCallBackend(CeedQFunctionGetData(qf, 
&qf_data)); 1229 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 1230 1231 // Get operator data 1232 CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points)); 1233 { 1234 CeedInt max_P = 0, max_P_1d = 0; 1235 1236 CeedCallBackend(CeedOperatorBuildKernelData_Hip_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields, 1237 qf_output_fields, &max_P, &max_P_1d, &Q, &Q_1d, &max_dim, &is_all_tensor, &use_3d_slices)); 1238 data->max_P_1d = is_all_tensor ? max_P_1d : max_P; 1239 } 1240 if (max_dim == 0) max_dim = 1; 1241 data->dim = max_dim; 1242 if (is_at_points) { 1243 CeedElemRestriction_Hip *rstr_data; 1244 CeedElemRestriction rstr_points = NULL; 1245 1246 CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); 1247 CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points)); 1248 CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride)); 1249 CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data)); 1250 data->points.indices = (CeedInt *)rstr_data->d_offsets; 1251 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); 1252 } 1253 if (is_at_points) use_3d_slices = false; 1254 if (Q_1d == 0) { 1255 if (is_at_points) Q_1d = max_num_points; 1256 else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d)); 1257 } 1258 if (Q == 0) Q = Q_1d; 1259 data->Q = Q; 1260 data->Q_1d = Q_1d; 1261 1262 // Check for restriction only identity operator 1263 { 1264 bool is_identity_qf; 1265 1266 CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf)); 1267 if (is_identity_qf) { 1268 CeedEvalMode eval_mode_in, eval_mode_out; 1269 1270 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in)); 1271 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out)); 1272 CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, 
CEED_ERROR_BACKEND, 1273 "Backend does not implement restriction only identity operators"); 1274 } 1275 } 1276 1277 // Load basis source files 1278 if (!is_all_nontensor) { 1279 code << tab << "// Tensor basis source\n"; 1280 code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n"; 1281 } 1282 if (!is_all_tensor) { 1283 code << tab << "// Non-tensor basis source\n"; 1284 code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n"; 1285 } 1286 if (is_at_points) { 1287 code << tab << "// AtPoints basis source\n"; 1288 code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n"; 1289 } 1290 if (!is_all_tensor && !is_all_nontensor) { 1291 code << tab << "// Tensor basis source\n"; 1292 code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h>\n\n"; 1293 } 1294 code << tab << "// CodeGen operator source\n"; 1295 code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n"; 1296 1297 // Get QFunction name 1298 std::string qfunction_name(qf_data->qfunction_name); 1299 std::string operator_name; 1300 1301 operator_name = "CeedKernelHipGenOperator_" + qfunction_name; 1302 1303 // Define CEED_Q_VLA 1304 code << "\n" << tab << "#undef CEED_Q_VLA\n"; 1305 if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) { 1306 code << tab << "#define CEED_Q_VLA 1\n\n"; 1307 } else { 1308 code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n"; 1309 } 1310 1311 // Add user QFunction source 1312 { 1313 const char *source_path; 1314 1315 CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path)); 1316 CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file"); 1317 1318 code << tab << "// User QFunction source\n"; 1319 code << tab << "#include \"" << source_path << "\"\n\n"; 1320 } 1321 1322 // Setup 1323 code << "\n" << tab << "// 
-----------------------------------------------------------------------------\n"; 1324 code << tab << "// Operator Kernel\n"; 1325 code << tab << "// \n"; 1326 code << tab << "// d_[in,out]_i: CeedVector device array\n"; 1327 code << tab << "// r_[in,out]_e_i: Element vector register\n"; 1328 code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n"; 1329 code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n"; 1330 code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n"; 1331 code << tab << "// \n"; 1332 code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n"; 1333 code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; 1334 code << tab << "// -----------------------------------------------------------------------------\n"; 1335 code << tab << "extern \"C\" __launch_bounds__(BLOCK_SIZE)\n"; 1336 code << "__global__ void " << operator_name 1337 << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar* W, Points_Hip points) {\n"; 1338 tab.push(); 1339 1340 // Scratch buffers 1341 for (CeedInt i = 0; i < num_input_fields; i++) { 1342 CeedEvalMode eval_mode; 1343 1344 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 1345 if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 1346 code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; 1347 } 1348 } 1349 for (CeedInt i = 0; i < num_output_fields; i++) { 1350 code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; 1351 } 1352 1353 code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; 1354 if (!is_all_tensor) { 1355 code << tab << "const CeedInt Q = " << Q << ";\n"; 1356 } 1357 if (!is_all_nontensor) { 1358 code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n"; 1359 } 1360 if (is_at_points) { 1361 code << tab << "const CeedInt max_num_points = " << 
max_num_points << ";\n"; 1362 code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n"; 1363 } 1364 1365 // Shared data 1366 code << tab << "extern __shared__ CeedScalar slice[];\n"; 1367 code << tab << "SharedData_Hip data;\n"; 1368 code << tab << "data.t_id_x = threadIdx.x;\n"; 1369 code << tab << "data.t_id_y = threadIdx.y;\n"; 1370 code << tab << "data.t_id_z = threadIdx.z;\n"; 1371 code << tab << "data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 1372 code << tab << "data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n"; 1373 1374 // -- Determine input mat reuse 1375 FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX]; 1376 1377 for (CeedInt i = 0; i < num_input_fields; i++) { 1378 input_matrix_reuse[i].index = -1; 1379 } 1380 for (CeedInt i = 0; i < num_input_fields; i++) { 1381 bool is_tensor = true; 1382 CeedEvalMode eval_mode_i; 1383 CeedBasis basis_i; 1384 1385 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i)); 1386 if (eval_mode_i == CEED_EVAL_WEIGHT) continue; 1387 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i)); 1388 CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor)); 1389 for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) { 1390 CeedEvalMode eval_mode_j; 1391 CeedBasis basis_j; 1392 1393 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); 1394 if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 1395 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); 1396 if (basis_i == basis_j) { 1397 if (is_tensor) { 1398 input_matrix_reuse[i].index = j; 1399 input_matrix_reuse[i].is_input = true; 1400 input_matrix_reuse[i].eval_mode = eval_mode_j; 1401 } else { 1402 // For non-tensor can only re-use with the same eval mode 1403 if (eval_mode_i == eval_mode_j) { 1404 input_matrix_reuse[i].index = j; 1405 
input_matrix_reuse[i].is_input = true; 1406 input_matrix_reuse[i].eval_mode = eval_mode_j; 1407 } 1408 } 1409 } 1410 CeedCallBackend(CeedBasisDestroy(&basis_j)); 1411 } 1412 CeedCallBackend(CeedBasisDestroy(&basis_i)); 1413 } 1414 1415 // -- Determine output mat reuse 1416 FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX]; 1417 1418 for (CeedInt i = 0; i < num_output_fields; i++) { 1419 output_matrix_reuse[i].index = -1; 1420 } 1421 for (CeedInt i = 0; i < num_output_fields; i++) { 1422 bool is_tensor = true; 1423 CeedEvalMode eval_mode_i; 1424 CeedBasis basis_i; 1425 1426 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i)); 1427 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i)); 1428 for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) { 1429 CeedEvalMode eval_mode_j; 1430 CeedBasis basis_j; 1431 1432 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); 1433 if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 1434 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); 1435 if (basis_i == basis_j) { 1436 if (is_tensor) { 1437 output_matrix_reuse[i].index = j; 1438 output_matrix_reuse[i].is_input = true; 1439 output_matrix_reuse[i].eval_mode = eval_mode_j; 1440 } else { 1441 // For non-tensor can only re-use with the same eval mode 1442 if (eval_mode_i == eval_mode_j) { 1443 output_matrix_reuse[i].index = j; 1444 output_matrix_reuse[i].is_input = true; 1445 output_matrix_reuse[i].eval_mode = eval_mode_j; 1446 } 1447 } 1448 } 1449 CeedCallBackend(CeedBasisDestroy(&basis_j)); 1450 } 1451 for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) { 1452 CeedEvalMode eval_mode_j; 1453 CeedBasis basis_j; 1454 1455 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j)); 1456 if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 1457 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j)); 
1458 CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor)); 1459 if (basis_i == basis_j) { 1460 if (is_tensor) { 1461 output_matrix_reuse[i].index = j; 1462 output_matrix_reuse[i].is_input = false; 1463 output_matrix_reuse[i].eval_mode = eval_mode_j; 1464 } else { 1465 // For non-tensor can only re-use with the same eval mode 1466 if (eval_mode_i == eval_mode_j) { 1467 output_matrix_reuse[i].index = j; 1468 output_matrix_reuse[i].is_input = false; 1469 output_matrix_reuse[i].eval_mode = eval_mode_j; 1470 } 1471 } 1472 } 1473 CeedCallBackend(CeedBasisDestroy(&basis_j)); 1474 } 1475 CeedCallBackend(CeedBasisDestroy(&basis_i)); 1476 } 1477 1478 // Initialize constants, and matrices B and G 1479 code << "\n" << tab << "// Input field constants and basis data\n"; 1480 for (CeedInt i = 0; i < num_input_fields; i++) { 1481 CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], 1482 max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false)); 1483 } 1484 code << "\n" << tab << "// Output field constants and basis data\n"; 1485 for (CeedInt i = 0; i < num_output_fields; i++) { 1486 CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], 1487 max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false)); 1488 } 1489 1490 // Loop over all elements 1491 code << "\n" << tab << "// Element loop\n"; 1492 code << tab << "__syncthreads();\n"; 1493 code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n"; 1494 tab.push(); 1495 1496 // -- Compute minimum buffer space needed 1497 CeedInt max_rstr_buffer_size = 1; 1498 1499 for (CeedInt i = 0; i < num_input_fields; i++) { 1500 CeedEvalMode eval_mode; 1501 1502 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 1503 if (eval_mode != 
CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) { 1504 CeedInt num_comp; 1505 CeedElemRestriction elem_rstr; 1506 1507 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 1508 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 1509 max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); 1510 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 1511 } 1512 } 1513 for (CeedInt i = 0; i < num_output_fields; i++) { 1514 CeedEvalMode eval_mode; 1515 1516 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 1517 if (eval_mode != CEED_EVAL_NONE) { 1518 CeedInt num_comp; 1519 CeedElemRestriction elem_rstr; 1520 1521 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 1522 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 1523 max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? 
Q_1d : 1)); 1524 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 1525 } 1526 } 1527 code << tab << "// Scratch restriction buffer space\n"; 1528 code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n"; 1529 1530 // -- Determine best input field processing order 1531 CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX]; 1532 1533 for (CeedInt i = 0; i < num_input_fields; i++) { 1534 field_rstr_in_buffer[i] = -1; 1535 input_field_order[i] = -1; 1536 } 1537 { 1538 bool is_ordered[CEED_FIELD_MAX]; 1539 CeedInt curr_index = 0; 1540 1541 for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false; 1542 for (CeedInt i = 0; i < num_input_fields; i++) { 1543 CeedVector vec_i; 1544 CeedElemRestriction rstr_i; 1545 1546 if (is_ordered[i]) continue; 1547 field_rstr_in_buffer[i] = i; 1548 is_ordered[i] = true; 1549 input_field_order[curr_index] = i; 1550 curr_index++; 1551 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i)); 1552 if (vec_i == CEED_VECTOR_NONE) continue; // CEED_EVAL_WEIGHT 1553 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i)); 1554 for (CeedInt j = i + 1; j < num_input_fields; j++) { 1555 CeedVector vec_j; 1556 CeedElemRestriction rstr_j; 1557 1558 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j)); 1559 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j)); 1560 if (rstr_i == rstr_j && vec_i == vec_j) { 1561 field_rstr_in_buffer[j] = i; 1562 is_ordered[j] = true; 1563 input_field_order[curr_index] = j; 1564 curr_index++; 1565 } 1566 CeedCallBackend(CeedVectorDestroy(&vec_j)); 1567 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); 1568 } 1569 CeedCallBackend(CeedVectorDestroy(&vec_i)); 1570 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); 1571 } 1572 } 1573 1574 // -- Input restriction and basis 1575 code << "\n" << tab << "// -- Input field restrictions and basis actions\n"; 1576 for 
(CeedInt i = 0; i < num_input_fields; i++) { 1577 const char *field_name; 1578 const CeedInt f = input_field_order[i]; 1579 1580 CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name)); 1581 code << tab << "// ---- Input field " << f << ": " << field_name << "\n"; 1582 1583 // ---- Restriction 1584 CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], 1585 max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices)); 1586 1587 // ---- Basis action 1588 CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true, 1589 is_all_tensor, is_at_points, use_3d_slices)); 1590 } 1591 1592 // -- Q function 1593 CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields, 1594 qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name, 1595 Q_1d, is_all_tensor, is_at_points, use_3d_slices)); 1596 1597 // -- Output basis and restriction 1598 code << "\n" << tab << "// -- Output field basis action and restrictions\n"; 1599 for (CeedInt i = 0; i < num_output_fields; i++) { 1600 const char *field_name; 1601 1602 CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 1603 code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 1604 1605 // ---- Basis action 1606 CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false, 1607 is_all_tensor, is_at_points, use_3d_slices)); 1608 1609 // ---- Restriction 1610 CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, i, NULL, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, 1611 false, is_all_tensor, is_at_points, use_3d_slices)); 1612 } 1613 1614 // Close loop and function 1615 tab.pop(); 1616 code << tab << "}\n"; 
  tab.pop();
  code << tab << "}\n";
  code << tab << "// -----------------------------------------------------------------------------\n\n";

  CeedInt block_sizes[3] = {0, 0, 0};
  CeedInt num_elem;

  // Compile
  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
  // Note: non-tensor bases are launched with a 1D thread layout (dim forced to 1)
  CeedCallBackend(BlockGridCalculate_Hip_gen(is_all_tensor ? max_dim : 1, num_elem, data->max_P_1d, is_all_tensor ? Q_1d : Q, block_sizes));
  {
    bool is_compile_good = false;

    data->thread_1d = block_sizes[0];
    CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "OP_T_1D", block_sizes[0], "BLOCK_SIZE",
                                       block_sizes[0] * block_sizes[1] * block_sizes[2]));
    if (is_compile_good) {
      *is_good_build = true;
      CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, operator_name.c_str(), &data->op));
    } else {
      // JIT compile failed; flag so the caller can fall back to a reference path
      *is_good_build = false;
      data->use_fallback = true;
    }
  }
  CeedCallBackend(CeedOperatorSetSetupDone(op));
  CeedCallBackend(CeedDestroy(&ceed));
  CeedCallBackend(CeedQFunctionDestroy(&qf));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Build AtPoints assembly operator kernel
//
// Generates HIP C++ source for a single fused kernel that assembles an
// AtPoints operator, then JIT compiles it.  The kernel applies the operator
// to unit vectors at each node of the active input field and writes the
// resulting rows/entries into `values_array`.
//
// op            - operator to assemble (must be an AtPoints operator)
// is_full       - true: full matrix assembly (WriteLVecStandard*d_Assembly);
//                 false: diagonal-only assembly (WriteLVecStandard*d_Single)
// is_good_build - set true iff JIT compilation succeeded; on failure
//                 data->use_assembly_fallback is set so a non-fused
//                 assembly path can be used instead
//
// Returns CEED_ERROR_SUCCESS; all intermediate failures propagate through
// CeedCallBackend.
//------------------------------------------------------------------------------
static int CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op, bool is_full, bool *is_good_build) {
  bool is_all_tensor = true, is_at_points = false, use_3d_slices = false;
  Ceed ceed;
  CeedInt Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
  CeedQFunctionField *qf_input_fields, *qf_output_fields;
  CeedQFunction_Hip_gen *qf_data;
  CeedQFunction qf;
  CeedOperatorField *op_input_fields, *op_output_fields;
  CeedOperator_Hip_gen *data;
  std::ostringstream code;
  Tab tab;

  // Check compatibility - this builder only supports AtPoints operators
  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
  CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
  CeedCheck(is_at_points, ceed, CEED_ERROR_BACKEND, "Only AtPoints operator assembly supported");

  // Retrieve operator data (quadrature sizes and dimension cached at setup)
  CeedCallBackend(CeedOperatorGetData(op, &data));
  Q = data->Q;
  Q_1d = data->Q_1d;
  max_dim = data->dim;
  {
    CeedElemRestriction rstr_points = NULL;

    // Point restriction supplies the per-element point count bound and the
    // component stride used to index coordinate data in the kernel
    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
    CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
    CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
    CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
  }
  CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
  CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
  CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));

  // Load basis source files (JIT #includes resolved by the HIP compile path)
  code << tab << "// Tensor basis source\n";
  code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
  code << tab << "// AtPoints basis source\n";
  code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n";
  code << tab << "// CodeGen operator source\n";
  code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";

  // Get QFunction name; the kernel name encodes assembly flavor + QFunction
  std::string qfunction_name(qf_data->qfunction_name);
  std::string operator_name;

  if (is_full) {
    operator_name = "CeedKernelHipGenOperatorFullAssembly_" + qfunction_name;
  } else {
    operator_name = "CeedKernelHipGenOperatorDiagonalAssembly_" + qfunction_name;
  }

  // Define CEED_Q_VLA (AtPoints assembly always processes one point at a time)
  code << "\n" << tab << "#undef CEED_Q_VLA\n";
  code << tab << "#define CEED_Q_VLA 1\n\n";

  // Add user QFunction source
  {
    const char *source_path;

    CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
    CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");

    code << tab << "// User QFunction source\n";
    code << tab << "#include \"" << source_path << "\"\n\n";
  }

  // Setup - kernel banner documenting the generated variable naming scheme
  code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
  code << tab << "// Operator Assembly Kernel\n";
  code << tab << "// \n";
  code << tab << "// d_[in,out]_i: CeedVector device array\n";
  code << tab << "// r_[in,out]_e_i: Element vector register\n";
  code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
  code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
  code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
  code << tab << "// \n";
  code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
  code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
  code << tab << "// -----------------------------------------------------------------------------\n";
  code << tab << "extern \"C\" __global__ void " << operator_name
       << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar *W, Points_Hip "
          "points, CeedScalar *__restrict__ values_array) {\n";
  tab.push();

  // Scratch buffers - device pointers for each non-weight field
  for (CeedInt i = 0; i < num_input_fields; i++) {
    CeedEvalMode eval_mode;

    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
    if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
      code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
    }
  }
  for (CeedInt i = 0; i < num_output_fields; i++) {
    code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
  }

  // Emit problem-size constants into the generated kernel
  code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
  code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
  code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n";
  code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";

  // Shared data - per-thread-block slice of dynamic shared memory
  code << tab << "extern __shared__ CeedScalar slice[];\n";
  code << tab << "SharedData_Hip data;\n";
  code << tab << "data.t_id_x = threadIdx.x;\n";
  code << tab << "data.t_id_y = threadIdx.y;\n";
  code << tab << "data.t_id_z = threadIdx.z;\n";
  code << tab << "data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
  // 1D (or non-tensor) kernels use an OP_T_1D slice; higher-dim tensor kernels use OP_T_1D^2
  code << tab << "data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";

  // -- Determine input mat reuse
  // A field whose basis object is shared with an earlier input reuses that
  // field's shared-memory B/G matrices instead of loading its own copy.
  // NOTE(review): unlike the non-AtPoints builder, no tensor/eval-mode
  // distinction is made here - presumably safe because AtPoints bases are
  // tensor; confirm if non-tensor AtPoints support is ever added.
  FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];

  for (CeedInt i = 0; i < num_input_fields; i++) {
    input_matrix_reuse[i].index = -1;
  }
  for (CeedInt i = 0; i < num_input_fields; i++) {
    CeedEvalMode eval_mode_i;
    CeedBasis basis_i;

    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
    if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
    CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
    for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
      CeedEvalMode eval_mode_j;
      CeedBasis basis_j;

      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
      if (basis_i == basis_j) {
        input_matrix_reuse[i].index = j;
        input_matrix_reuse[i].is_input = true;
        input_matrix_reuse[i].eval_mode = eval_mode_j;
      }
      CeedCallBackend(CeedBasisDestroy(&basis_j));
    }
    CeedCallBackend(CeedBasisDestroy(&basis_i));
  }

  // -- Determine output mat reuse
  // Outputs may reuse matrices from any input first, then from earlier outputs
  FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];

  for (CeedInt i = 0; i < num_output_fields; i++) {
    output_matrix_reuse[i].index = -1;
  }
  for (CeedInt i = 0; i < num_output_fields; i++) {
    CeedEvalMode eval_mode_i;
    CeedBasis basis_i;

    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
    CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
    for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
      CeedEvalMode eval_mode_j;
      CeedBasis basis_j;

      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
      CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
      if (basis_i == basis_j) {
        output_matrix_reuse[i].index = j;
        output_matrix_reuse[i].is_input = true;
        output_matrix_reuse[i].eval_mode = eval_mode_j;
      }
      CeedCallBackend(CeedBasisDestroy(&basis_j));
    }
    for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
      CeedEvalMode eval_mode_j;
      CeedBasis basis_j;

      CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
      if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
      CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
      if (basis_i == basis_j) {
        output_matrix_reuse[i].index = j;
        output_matrix_reuse[i].is_input = false;
        output_matrix_reuse[i].eval_mode = eval_mode_j;
      }
      CeedCallBackend(CeedBasisDestroy(&basis_j));
    }
    CeedCallBackend(CeedBasisDestroy(&basis_i));
  }

  // Initialize constants, and matrices B and G
  code << "\n" << tab << "// Input field constants and basis data\n";
  for (CeedInt i = 0; i < num_input_fields; i++) {
    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
                                                             max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false));
  }
  code << "\n" << tab << "// Output field constants and basis data\n";
  for (CeedInt i = 0; i < num_output_fields; i++) {
    CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
                                                             max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false));
  }

  // Loop over all elements (grid-stride over blockIdx.x * blockDim.z batches)
  code << "\n" << tab << "// Element loop\n";
  code << tab << "__syncthreads();\n";
  code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
  tab.push();

  // -- Compute minimum buffer space needed
  // Scratch must hold the largest restriction working set of any field;
  // 3D tensor fields need num_comp * Q_1d entries, all others num_comp
  CeedInt max_rstr_buffer_size = 1;

  for (CeedInt i = 0; i < num_input_fields; i++) {
    CeedEvalMode eval_mode;

    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
    if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
      CeedInt num_comp;
      CeedElemRestriction elem_rstr;

      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
      max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
    }
  }
  for (CeedInt i = 0; i < num_output_fields; i++) {
    CeedEvalMode eval_mode;

    CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
    if (eval_mode != CEED_EVAL_NONE) {
      CeedInt num_comp;
      CeedElemRestriction elem_rstr;

      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
      CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
      max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
    }
  }
  code << tab << "// Scratch restriction buffer space\n";
  code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";

  // -- Determine best input field processing order
  // Group fields sharing the same (restriction, vector) pair so a later
  // field can reuse the E-vector already gathered for an earlier one;
  // field_rstr_in_buffer[j] records which field's buffer holds j's data
  CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];

  for (CeedInt i = 0; i < num_input_fields; i++) {
    field_rstr_in_buffer[i] = -1;
    input_field_order[i] = -1;
  }
  {
    bool is_ordered[CEED_FIELD_MAX];
    CeedInt curr_index = 0;

    for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
    for (CeedInt i = 0; i < num_input_fields; i++) {
      CeedVector vec_i;
      CeedElemRestriction rstr_i;

      if (is_ordered[i]) continue;
      field_rstr_in_buffer[i] = i;
      is_ordered[i] = true;
      input_field_order[curr_index] = i;
      curr_index++;
      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
      if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
      for (CeedInt j = i + 1; j < num_input_fields; j++) {
        CeedVector vec_j;
        CeedElemRestriction rstr_j;

        CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
        CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
        if (rstr_i == rstr_j && vec_i == vec_j) {
          field_rstr_in_buffer[j] = i;
          is_ordered[j] = true;
          input_field_order[curr_index] = j;
          curr_index++;
        }
        CeedCallBackend(CeedVectorDestroy(&vec_j));
        CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
      }
      CeedCallBackend(CeedVectorDestroy(&vec_i));
      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
    }
  }

  // -- Input restriction and basis
  // Passive fields get their restriction + basis action emitted once here;
  // the (first) active field instead gets a zeroed E-vector register that
  // the per-node assembly loop below will populate with unit vectors
  code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
  CeedInt active_field_index = -1;

  for (CeedInt i = 0; i < num_input_fields; i++) {
    bool is_active = false;
    const char *field_name;
    const CeedInt f = input_field_order[i];

    {
      CeedVector vec;

      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
      is_active = vec == CEED_VECTOR_ACTIVE;
      CeedCallBackend(CeedVectorDestroy(&vec));
    }

    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
    code << tab << "// ---- Input field " << f << ": " << field_name << "\n";

    if (is_active) {
      std::string var_suffix = "_in_" + std::to_string(f);

      code << tab << "// Active field - no restriction or basis action here\n";
      if (active_field_index == -1) {
        active_field_index = f;
        code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? "P_1d" + var_suffix : "1")
             << "] = {0.0};\n";
      } else {
        // Additional active inputs alias the first active field's registers
        code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_in_" << active_field_index << ";\n";
      }
    } else {
      // ---- Restriction
      CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
                                                                 max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));

      // ---- Basis action
      CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
                                                           is_all_tensor, is_at_points, use_3d_slices));
    }
  }

  // -- Loop over active field
  // Assembly works column-by-column: for each node n of the active field,
  // set a 1.0, apply the operator, write the result, then reset to 0.0
  std::string active_var_suffix = "_in_" + std::to_string(active_field_index);

  code << "\n" << tab << "// Loop over nodes in active field\n";
  code << tab << "for (CeedInt n = 0; n < num_comp" << active_var_suffix << "*P_1d" << active_var_suffix
       << (max_dim > 1 ? "*P_1d" + active_var_suffix : "") << (max_dim > 2 ? "*P_1d" + active_var_suffix : "") << "; n++) {\n";
  tab.push();

  // -- Set current active node and component to 1
  code << tab << "// Set current active node and component to 1.0\n";
  code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 1.0, r_e"
       << active_var_suffix << ");\n\n";

  // Re-emit the basis action for every active input so the unit vector is
  // propagated to quadrature space each iteration
  for (CeedInt i = 0; i < num_input_fields; i++) {
    bool is_active = false;
    const char *field_name;
    const CeedInt f = input_field_order[i];

    {
      CeedVector vec;

      CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
      is_active = vec == CEED_VECTOR_ACTIVE;
      CeedCallBackend(CeedVectorDestroy(&vec));
    }
    if (!is_active) continue;

    CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
    code << tab << "// ---- Input field " << f << ": " << field_name << "\n";

    // ---- Basis action
    CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
                                                         is_all_tensor, is_at_points, use_3d_slices));
  }

  // -- Q function
  CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
                                                           qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
                                                           Q_1d, is_all_tensor, is_at_points, use_3d_slices));

  // -- Output basis and restriction (active outputs only)
  code << "\n" << tab << "// -- Output field basis action and restrictions\n";
  for (CeedInt i = 0; i < num_output_fields; i++) {
    bool is_active = false;
    const char *field_name;

    {
      CeedVector vec;

      CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
      is_active = vec == CEED_VECTOR_ACTIVE;
      CeedCallBackend(CeedVectorDestroy(&vec));
    }
    if (!is_active) continue;

    CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
    code << tab << "// ---- Output field " << i << ": " << field_name << "\n";

    // ---- Basis action
    CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
                                                         is_all_tensor, is_at_points, use_3d_slices));

    // ---- Restriction
    if (is_full) {
      // Full assembly: write the whole column n into values_array
      std::string var_suffix = "_out_" + std::to_string(i);
      CeedInt comp_stride;
      CeedSize l_size;
      CeedElemRestriction elem_rstr;

      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
      CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
      code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
      CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
      code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
      code << tab << "WriteLVecStandard" << max_dim << "d_Assembly<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
           << ">(data, l_size" << var_suffix << ", elem, n, r_e" << var_suffix << ", values_array);\n";
      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
    } else {
      // Diagonal assembly: write only the single entry for node n
      std::string var_suffix = "_out_" + std::to_string(i);
      CeedInt comp_stride;
      CeedSize l_size;
      CeedElemRestriction elem_rstr;

      CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
      CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
      code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
      CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
      code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
      code << tab << "WriteLVecStandard" << max_dim << "d_Single<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
           << ">(data, l_size" << var_suffix << ", elem, n, indices.outputs[" << i << "], r_e" << var_suffix << ", values_array);\n";
      CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
    }
  }

  // -- Reset current active node and component
  code << "\n" << tab << "// Reset current active node and component to 0.0\n";
  code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 0.0, r_e"
       << active_var_suffix << ");\n";

  // -- End of loop over active field
  tab.pop();
  code << tab << "}\n";

  // Close loop and function
  tab.pop();
  code << tab << "}\n";
  tab.pop();
  code << tab << "}\n";
  code << tab << "// -----------------------------------------------------------------------------\n\n";

  CeedInt block_sizes[3] = {0, 0, 0};
  CeedInt num_elem;

  // Compile
  CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
  CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes));
  {
    bool is_compile_good = false;

    data->thread_1d = block_sizes[0];
    // Module slot depends on assembly flavor so both kernels can coexist
    CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good,
                                       is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 2, "OP_T_1D", block_sizes[0],
                                       "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]));
    if (is_compile_good) {
      *is_good_build = true;
      CeedCallBackend(CeedGetKernel_Hip(ceed, is_full ? data->module_assemble_full : data->module_assemble_diagonal, operator_name.c_str(),
                                        is_full ? &data->assemble_full : &data->assemble_diagonal));
    } else {
      // JIT compile failed; flag so assembly falls back to a reference path
      *is_good_build = false;
      data->use_assembly_fallback = true;
    }
  }
  CeedCallBackend(CeedDestroy(&ceed));
  CeedCallBackend(CeedQFunctionDestroy(&qf));
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Build AtPoints diagonal assembly kernel (thin wrapper, is_full = false)
//------------------------------------------------------------------------------
extern "C" int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) {
  return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, false, is_good_build);
}

//------------------------------------------------------------------------------
// Build AtPoints full assembly kernel (thin wrapper, is_full = true)
//------------------------------------------------------------------------------
extern "C" int CeedOperatorBuildKernelFullAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) {
  return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, true, is_good_build);
}
//------------------------------------------------------------------------------
// Build QFunction assembly operator kernel
//------------------------------------------------------------------------------
extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperator op, bool *is_good_build) {
  bool is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
  Ceed ceed;
  CeedInt Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0;
  CeedQFunctionField *qf_input_fields, *qf_output_fields;
  CeedQFunction_Hip_gen *qf_data;
  CeedQFunction qf;
  CeedOperatorField *op_input_fields, *op_output_fields;
  CeedOperator_Hip_gen *data;
  std::ostringstream code;
  Tab tab;

  // Check compatibility
  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
  CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
  CeedCheck(!is_at_points, ceed, CEED_ERROR_BACKEND, "AtPoints QFunction assembly is not supported");

  // Check field compatibility
  CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
  {
    bool has_shared_bases = true;

    for (CeedInt i =
0; i < num_input_fields; i++) { 2144 CeedBasis basis; 2145 2146 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 2147 if (basis != CEED_BASIS_NONE) { 2148 bool is_tensor = true; 2149 const char *resource; 2150 char *resource_root; 2151 Ceed basis_ceed; 2152 2153 CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 2154 is_all_tensor = is_all_tensor && is_tensor; 2155 is_all_nontensor = is_all_nontensor && !is_tensor; 2156 CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); 2157 CeedCallBackend(CeedGetResource(basis_ceed, &resource)); 2158 CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); 2159 has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared"); 2160 CeedCallBackend(CeedFree(&resource_root)); 2161 CeedCallBackend(CeedDestroy(&basis_ceed)); 2162 } 2163 CeedCallBackend(CeedBasisDestroy(&basis)); 2164 } 2165 2166 for (CeedInt i = 0; i < num_output_fields; i++) { 2167 CeedBasis basis; 2168 2169 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 2170 if (basis != CEED_BASIS_NONE) { 2171 bool is_tensor = true; 2172 const char *resource; 2173 char *resource_root; 2174 Ceed basis_ceed; 2175 2176 CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 2177 is_all_tensor = is_all_tensor && is_tensor; 2178 is_all_nontensor = is_all_nontensor && !is_tensor; 2179 2180 CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); 2181 CeedCallBackend(CeedGetResource(basis_ceed, &resource)); 2182 CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); 2183 has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared"); 2184 CeedCallBackend(CeedFree(&resource_root)); 2185 CeedCallBackend(CeedDestroy(&basis_ceed)); 2186 } 2187 CeedCallBackend(CeedBasisDestroy(&basis)); 2188 } 2189 } 2190 2191 // Retrieve operator data 2192 CeedCallBackend(CeedOperatorGetData(op, &data)); 2193 Q = data->Q; 2194 Q_1d = data->Q_1d; 2195 max_dim = data->dim; 
2196 CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 2197 CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); 2198 CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 2199 2200 // Load basis source files 2201 if (!is_all_nontensor) { 2202 code << tab << "// Tensor basis source\n"; 2203 code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n"; 2204 } 2205 if (!is_all_tensor) { 2206 code << tab << "// Non-tensor basis source\n"; 2207 code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n"; 2208 } 2209 if (!is_all_tensor && !is_all_nontensor) { 2210 code << "// Tensor basis source\n"; 2211 code << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h>\n\n"; 2212 } 2213 code << "// CodeGen operator source\n"; 2214 code << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n"; 2215 2216 // Get QFunction name 2217 std::string qfunction_name(qf_data->qfunction_name); 2218 std::string operator_name; 2219 2220 operator_name = "CeedKernelHipGenQFunctionAssembly_" + qfunction_name; 2221 2222 // Define CEED_Q_VLA 2223 code << "\n" << tab << "#undef CEED_Q_VLA\n"; 2224 if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) { 2225 code << tab << "#define CEED_Q_VLA 1\n\n"; 2226 } else { 2227 code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n"; 2228 } 2229 2230 // Add user QFunction source 2231 { 2232 const char *source_path; 2233 2234 CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path)); 2235 CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file"); 2236 2237 code << tab << "// User QFunction source\n"; 2238 code << tab << "#include \"" << source_path << "\"\n\n"; 2239 } 2240 2241 // Setup 2242 code << "\n" << tab << "// -----------------------------------------------------------------------------\n"; 2243 code << tab << "// Operator Assembly Kernel\n"; 2244 
code << tab << "// \n"; 2245 code << tab << "// d_[in,out]_i: CeedVector device array\n"; 2246 code << tab << "// r_[in,out]_e_i: Element vector register\n"; 2247 code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n"; 2248 code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n"; 2249 code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n"; 2250 code << tab << "// \n"; 2251 code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n"; 2252 code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; 2253 code << tab << "// -----------------------------------------------------------------------------\n"; 2254 code << tab << "extern \"C\" __global__ void " << operator_name 2255 << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar *W, Points_Hip " 2256 "points, CeedScalar *__restrict__ values_array) {\n"; 2257 tab.push(); 2258 2259 // Scratch buffers 2260 for (CeedInt i = 0; i < num_input_fields; i++) { 2261 CeedEvalMode eval_mode; 2262 2263 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 2264 if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 2265 code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; 2266 } 2267 } 2268 for (CeedInt i = 0; i < num_output_fields; i++) { 2269 code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; 2270 } 2271 2272 code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; 2273 if (!is_all_tensor) { 2274 code << tab << "const CeedInt Q = " << Q << ";\n"; 2275 } 2276 if (!is_all_nontensor) { 2277 code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n"; 2278 } 2279 2280 // Shared data 2281 code << tab << "extern __shared__ CeedScalar slice[];\n"; 2282 code << tab << "SharedData_Hip data;\n"; 2283 code << tab << "data.t_id_x = threadIdx.x;\n"; 2284 code << tab << 
"data.t_id_y = threadIdx.y;\n"; 2285 code << tab << "data.t_id_z = threadIdx.z;\n"; 2286 code << tab << "data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 2287 code << tab << "data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n"; 2288 2289 // -- Determine input mat reuse 2290 FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX]; 2291 2292 for (CeedInt i = 0; i < num_input_fields; i++) { 2293 input_matrix_reuse[i].index = -1; 2294 } 2295 for (CeedInt i = 0; i < num_input_fields; i++) { 2296 bool is_tensor = true; 2297 CeedEvalMode eval_mode_i; 2298 CeedBasis basis_i; 2299 2300 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i)); 2301 if (eval_mode_i == CEED_EVAL_WEIGHT) continue; 2302 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i)); 2303 CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor)); 2304 for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) { 2305 CeedEvalMode eval_mode_j; 2306 CeedBasis basis_j; 2307 2308 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); 2309 if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 2310 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); 2311 if (basis_i == basis_j) { 2312 if (is_tensor) { 2313 input_matrix_reuse[i].index = j; 2314 input_matrix_reuse[i].is_input = true; 2315 input_matrix_reuse[i].eval_mode = eval_mode_j; 2316 } else { 2317 // For non-tensor can only re-use with the same eval mode 2318 if (eval_mode_i == eval_mode_j) { 2319 input_matrix_reuse[i].index = j; 2320 input_matrix_reuse[i].is_input = true; 2321 input_matrix_reuse[i].eval_mode = eval_mode_j; 2322 } 2323 } 2324 } 2325 CeedCallBackend(CeedBasisDestroy(&basis_j)); 2326 } 2327 CeedCallBackend(CeedBasisDestroy(&basis_i)); 2328 } 2329 2330 // -- Determine output mat reuse 2331 FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX]; 2332 2333 for 
(CeedInt i = 0; i < num_output_fields; i++) { 2334 output_matrix_reuse[i].index = -1; 2335 } 2336 for (CeedInt i = 0; i < num_output_fields; i++) { 2337 bool is_tensor = true; 2338 CeedEvalMode eval_mode_i; 2339 CeedBasis basis_i; 2340 2341 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i)); 2342 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i)); 2343 CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor)); 2344 for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) { 2345 CeedEvalMode eval_mode_j; 2346 CeedBasis basis_j; 2347 2348 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); 2349 if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 2350 CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); 2351 if (basis_i == basis_j) { 2352 if (is_tensor) { 2353 output_matrix_reuse[i].index = j; 2354 output_matrix_reuse[i].is_input = true; 2355 output_matrix_reuse[i].eval_mode = eval_mode_j; 2356 } else { 2357 // For non-tensor can only re-use with the same eval mode 2358 if (eval_mode_i == eval_mode_j) { 2359 output_matrix_reuse[i].index = j; 2360 output_matrix_reuse[i].is_input = true; 2361 output_matrix_reuse[i].eval_mode = eval_mode_j; 2362 } 2363 } 2364 } 2365 CeedCallBackend(CeedBasisDestroy(&basis_j)); 2366 } 2367 for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) { 2368 CeedEvalMode eval_mode_j; 2369 CeedBasis basis_j; 2370 2371 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j)); 2372 if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 2373 CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j)); 2374 if (basis_i == basis_j) { 2375 if (is_tensor) { 2376 output_matrix_reuse[i].index = j; 2377 output_matrix_reuse[i].is_input = false; 2378 output_matrix_reuse[i].eval_mode = eval_mode_j; 2379 } else { 2380 // For non-tensor can only re-use with the same eval mode 2381 if 
(eval_mode_i == eval_mode_j) { 2382 output_matrix_reuse[i].index = j; 2383 output_matrix_reuse[i].is_input = false; 2384 output_matrix_reuse[i].eval_mode = eval_mode_j; 2385 } 2386 } 2387 } 2388 CeedCallBackend(CeedBasisDestroy(&basis_j)); 2389 } 2390 CeedCallBackend(CeedBasisDestroy(&basis_i)); 2391 } 2392 2393 // Initialize constants, and matrices B and G 2394 code << "\n" << tab << "// Input field constants and basis data\n"; 2395 for (CeedInt i = 0; i < num_input_fields; i++) { 2396 CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], 2397 max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, true)); 2398 } 2399 code << "\n" << tab << "// Output field constants and basis data\n"; 2400 for (CeedInt i = 0; i < num_output_fields; i++) { 2401 CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], 2402 max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, true)); 2403 } 2404 2405 // Loop over all elements 2406 code << "\n" << tab << "// Element loop\n"; 2407 code << tab << "__syncthreads();\n"; 2408 code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n"; 2409 tab.push(); 2410 2411 // -- Compute minimum buffer space needed 2412 CeedInt max_rstr_buffer_size = 1; 2413 2414 for (CeedInt i = 0; i < num_input_fields; i++) { 2415 CeedEvalMode eval_mode; 2416 2417 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 2418 if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) { 2419 CeedInt num_comp; 2420 CeedElemRestriction elem_rstr; 2421 2422 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 2423 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 2424 max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, 
num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); 2425 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 2426 } 2427 } 2428 for (CeedInt i = 0; i < num_output_fields; i++) { 2429 CeedEvalMode eval_mode; 2430 2431 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 2432 if (eval_mode != CEED_EVAL_NONE) { 2433 CeedInt num_comp; 2434 CeedElemRestriction elem_rstr; 2435 2436 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 2437 CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 2438 max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); 2439 CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 2440 } 2441 } 2442 code << tab << "// Scratch restriction buffer space\n"; 2443 code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n"; 2444 2445 // -- Determine best input field processing order 2446 CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX]; 2447 2448 for (CeedInt i = 0; i < num_input_fields; i++) { 2449 field_rstr_in_buffer[i] = -1; 2450 input_field_order[i] = -1; 2451 } 2452 { 2453 bool is_ordered[CEED_FIELD_MAX]; 2454 CeedInt curr_index = 0; 2455 2456 for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false; 2457 for (CeedInt i = 0; i < num_input_fields; i++) { 2458 CeedVector vec_i; 2459 CeedElemRestriction rstr_i; 2460 2461 if (is_ordered[i]) continue; 2462 field_rstr_in_buffer[i] = i; 2463 is_ordered[i] = true; 2464 input_field_order[curr_index] = i; 2465 curr_index++; 2466 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i)); 2467 if (vec_i == CEED_VECTOR_NONE) continue; // CEED_EVAL_WEIGHT 2468 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i)); 2469 for (CeedInt j = i + 1; j < num_input_fields; j++) { 2470 CeedVector vec_j; 2471 CeedElemRestriction rstr_j; 2472 2473 
CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j)); 2474 CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j)); 2475 if (rstr_i == rstr_j && vec_i == vec_j) { 2476 field_rstr_in_buffer[j] = i; 2477 is_ordered[j] = true; 2478 input_field_order[curr_index] = j; 2479 curr_index++; 2480 } 2481 CeedCallBackend(CeedVectorDestroy(&vec_j)); 2482 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); 2483 } 2484 CeedCallBackend(CeedVectorDestroy(&vec_i)); 2485 CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); 2486 } 2487 } 2488 2489 // -- Input restriction and basis 2490 code << "\n" << tab << "// -- Input field restrictions and basis actions\n"; 2491 CeedInt num_active_in = 0, num_active_out = 0, qf_assembly_size_out = 0; 2492 CeedInt active_fields_in[CEED_FIELD_MAX], active_fields_out[CEED_FIELD_MAX]; 2493 2494 for (CeedInt i = 0; i < num_input_fields; i++) { 2495 bool is_active = false; 2496 const char *field_name; 2497 const CeedInt f = input_field_order[i]; 2498 2499 { 2500 CeedVector vec; 2501 2502 CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec)); 2503 is_active = vec == CEED_VECTOR_ACTIVE; 2504 CeedCallBackend(CeedVectorDestroy(&vec)); 2505 } 2506 2507 CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name)); 2508 code << tab << "// ---- Input field " << f << ": " << field_name << "\n"; 2509 2510 if (is_active) { 2511 CeedEvalMode eval_mode; 2512 CeedInt field_size; 2513 2514 active_fields_in[num_active_in] = f; 2515 num_active_in++; 2516 CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[f], &field_size)); 2517 CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[f], &eval_mode)); 2518 if (eval_mode == CEED_EVAL_GRAD) { 2519 code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << "dim_in_" << f << "*" 2520 << (is_all_tensor && (max_dim >= 3) ? 
"Q_1d" : "1") << "] = {0.};\n"; 2521 } else { 2522 code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n"; 2523 } 2524 code << tab << "const CeedInt field_size_in_" << f << " = " << field_size << ";\n"; 2525 } else { 2526 // ---- Restriction 2527 CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], 2528 max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices)); 2529 2530 // ---- Basis action 2531 CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true, 2532 is_all_tensor, is_at_points, use_3d_slices)); 2533 } 2534 } 2535 code << tab << "const CeedInt field_sizes_in[" << num_active_in << "] = {"; 2536 for (CeedInt i = 0; i < num_active_in; i++) { 2537 code << "field_size_in_" << active_fields_in[i] << (i < num_active_in - 1 ? ", " : ""); 2538 } 2539 code << "};\n"; 2540 code << tab << "CeedScalar * r_q_in[" << num_active_in << "] = {"; 2541 for (CeedInt i = 0; i < num_active_in; i++) { 2542 code << "r_q_in_" << active_fields_in[i] << (i < num_active_in - 1 ? 
", " : ""); 2543 } 2544 code << "};\n"; 2545 2546 for (CeedInt i = 0; i < num_output_fields; i++) { 2547 bool is_active = false; 2548 2549 { 2550 CeedVector vec; 2551 2552 CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); 2553 is_active = vec == CEED_VECTOR_ACTIVE; 2554 CeedCallBackend(CeedVectorDestroy(&vec)); 2555 } 2556 if (is_active) { 2557 const char *field_name; 2558 CeedInt field_size; 2559 2560 active_fields_out[num_active_out] = i; 2561 num_active_out++; 2562 CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size)); 2563 qf_assembly_size_out += field_size; 2564 CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 2565 code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 2566 code << tab << "const CeedInt field_size_out_" << i << " = " << field_size << ";\n"; 2567 } 2568 } 2569 code << tab << "const CeedInt field_sizes_out[" << num_active_out << "] = {"; 2570 for (CeedInt i = 0; i < num_active_out; i++) { 2571 code << "field_size_out_" << active_fields_out[i] << (i < num_active_out - 1 ? 
", " : ""); 2572 } 2573 code << "};\n"; 2574 code << tab << "const CeedInt total_size_out = " << qf_assembly_size_out << ";\n"; 2575 2576 // -- Loop over active field 2577 code << "\n" << tab << "CeedInt input_offset = 0;\n"; 2578 code << tab << "// Loop over active QFunction input fields\n"; 2579 code << tab << "const CeedInt num_active_in = " << num_active_in << ";\n"; 2580 code << tab << "for (CeedInt a = 0; a < num_active_in; a++) {\n"; 2581 tab.push(); 2582 2583 // -- Loop over size of active field 2584 code << "\n" << tab << "// Loop over current active input field size\n"; 2585 code << tab << "const CeedInt field_size_in = field_sizes_in[a];\n"; 2586 code << tab << "for (CeedInt s = 0; s < field_size_in; s++) {\n"; 2587 tab.push(); 2588 2589 // -- Set current active point and component to 1 2590 code << tab << "// Set current active point and component to 1.0\n"; 2591 if (is_all_tensor && (max_dim >= 3)) { 2592 code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 1.0;\n"; 2593 } else { 2594 code << tab << "r_q_in[a][s] = 1.0;\n"; 2595 } 2596 2597 // -- Q function 2598 CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields, 2599 qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name, 2600 Q_1d, is_all_tensor, is_at_points, use_3d_slices)); 2601 2602 // -- Output basis and restriction 2603 code << "\n" << tab << "// -- Output field basis action and restrictions\n"; 2604 CeedScalar offset = 0; 2605 2606 for (CeedInt i = 0; i < num_output_fields; i++) { 2607 bool is_active = false; 2608 const char *field_name; 2609 2610 { 2611 CeedVector vec; 2612 2613 CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); 2614 is_active = vec == CEED_VECTOR_ACTIVE; 2615 CeedCallBackend(CeedVectorDestroy(&vec)); 2616 } 2617 if (!is_active) continue; 2618 2619 CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], 
&field_name)); 2620 code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 2621 2622 // ---- Restriction 2623 CeedInt field_size; 2624 2625 code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d_QFAssembly<total_size_out, field_size_out_" << i << ", " 2626 << (is_all_tensor ? "Q_1d" : "Q") << ">(data, num_elem, elem, input_offset + s, " << offset << ", r_q_out_" << i << ", values_array);\n"; 2627 CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size)); 2628 offset += field_size; 2629 } 2630 2631 // -- Reset current active node and component 2632 code << "\n" << tab << "// Reset current active node and component to 0.0\n"; 2633 if (is_all_tensor && (max_dim >= 3)) { 2634 code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 0.0;\n"; 2635 } else { 2636 code << tab << "r_q_in[a][s] = 0.0;\n"; 2637 } 2638 2639 // -- End of loop over size of active field 2640 tab.pop(); 2641 code << tab << "}\n"; 2642 code << tab << "input_offset += field_size_in;\n"; 2643 2644 // -- End of loop over active field 2645 tab.pop(); 2646 code << tab << "}\n"; 2647 2648 // Close loop and function 2649 tab.pop(); 2650 code << tab << "}\n"; 2651 tab.pop(); 2652 code << tab << "}\n"; 2653 code << tab << "// -----------------------------------------------------------------------------\n\n"; 2654 2655 CeedInt block_sizes[3] = {0, 0, 0}; 2656 CeedInt num_elem; 2657 2658 // Compile 2659 CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); 2660 CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes)); 2661 { 2662 bool is_compile_good = false; 2663 2664 data->thread_1d = block_sizes[0]; 2665 CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 2, "OP_T_1D", block_sizes[0], 2666 "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2])); 2667 if (is_compile_good) { 2668 *is_good_build = true; 2669 
CeedCallBackend(CeedGetKernel_Hip(ceed, data->module_assemble_qfunction, operator_name.c_str(), &data->assemble_qfunction)); 2670 } else { 2671 *is_good_build = false; 2672 data->use_assembly_fallback = true; 2673 } 2674 } 2675 CeedCallBackend(CeedDestroy(&ceed)); 2676 CeedCallBackend(CeedQFunctionDestroy(&qf)); 2677 return CEED_ERROR_SUCCESS; 2678 } 2679 2680 //------------------------------------------------------------------------------ 2681