19ba83ac0SJeremy L Thompson // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. 23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 37d8d0e25Snbeams // 43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 57d8d0e25Snbeams // 63d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 73d576824SJeremy L Thompson 87d8d0e25Snbeams #define CEED_DEBUG_COLOR 12 97d8d0e25Snbeams 1049aac155SJeremy L Thompson #include <ceed.h> 11ec3da8bcSJed Brown #include <ceed/backend.h> 120183ed61SJeremy L Thompson #include <ceed/gen-tools.h> 139e201c85SYohann #include <ceed/jit-tools.h> 142b730f8bSJeremy L Thompson 157d8d0e25Snbeams #include <iostream> 167d8d0e25Snbeams #include <sstream> 172b730f8bSJeremy L Thompson #include <string> 182b730f8bSJeremy L Thompson 190d0321e0SJeremy L Thompson #include "../hip-ref/ceed-hip-ref.h" 207d8d0e25Snbeams #include "../hip-shared/ceed-hip-shared.h" 21b2165e7aSSebastian Grimberg #include "../hip/ceed-hip-common.h" 227d8d0e25Snbeams #include "../hip/ceed-hip-compile.h" 232b730f8bSJeremy L Thompson #include "ceed-hip-gen.h" 24b3e1519bSnbeams 2545a787f7SJeremy L Thompson struct FieldReuse_Hip { 2645a787f7SJeremy L Thompson CeedInt index; 2745a787f7SJeremy L Thompson bool is_input; 2845a787f7SJeremy L Thompson CeedEvalMode eval_mode; 2945a787f7SJeremy L Thompson }; 3045a787f7SJeremy L Thompson 31b3e1519bSnbeams //------------------------------------------------------------------------------ 32b3e1519bSnbeams // Calculate the block size used for launching the operator kernel 33b3e1519bSnbeams //------------------------------------------------------------------------------ 342b730f8bSJeremy L Thompson extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt num_elem, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes) { 353a2968d6SJeremy L Thompson const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 36b3e1519bSnbeams if (dim == 1) { 373a2968d6SJeremy L Thompson CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; 38b7453713SJeremy L Thompson 399e201c85SYohann elems_per_block = elems_per_block > 0 ? elems_per_block : 1; 403a2968d6SJeremy L Thompson block_sizes[0] = thread_1d; 41b3e1519bSnbeams block_sizes[1] = 1; 429e201c85SYohann block_sizes[2] = elems_per_block; 43b3e1519bSnbeams } else if (dim == 2) { 443a2968d6SJeremy L Thompson const CeedInt elems_per_block = thread_1d < 4 ? 16 : 2; 45b7453713SJeremy L Thompson 463a2968d6SJeremy L Thompson block_sizes[0] = thread_1d; 473a2968d6SJeremy L Thompson block_sizes[1] = thread_1d; 489e201c85SYohann block_sizes[2] = elems_per_block; 49b3e1519bSnbeams } else if (dim == 3) { 503a2968d6SJeremy L Thompson const CeedInt elems_per_block = thread_1d < 6 ? 4 : (thread_1d < 8 ? 2 : 1); 51b7453713SJeremy L Thompson 523a2968d6SJeremy L Thompson block_sizes[0] = thread_1d; 533a2968d6SJeremy L Thompson block_sizes[1] = thread_1d; 549e201c85SYohann block_sizes[2] = elems_per_block; 55b3e1519bSnbeams } 56b3e1519bSnbeams return CEED_ERROR_SUCCESS; 57b3e1519bSnbeams } 58b3e1519bSnbeams 597d8d0e25Snbeams //------------------------------------------------------------------------------ 604b3e95d5SJeremy L Thompson // Determine type of operator 614b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 624b3e95d5SJeremy L Thompson static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields, 634b3e95d5SJeremy L Thompson CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields, 6474398b5aSJeremy L Thompson CeedQFunctionField *qf_output_fields, CeedInt *max_P, CeedInt *max_P_1d, CeedInt *Q, CeedInt *Q_1d, 6574398b5aSJeremy L Thompson CeedInt *max_dim, bool *is_all_tensor, bool *use_3d_slices) { 6674398b5aSJeremy L Thompson // Check if all are tensor 6774398b5aSJeremy L Thompson *is_all_tensor = true; 684b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 694b3e95d5SJeremy L Thompson CeedBasis basis; 704b3e95d5SJeremy L Thompson 714b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 724b3e95d5SJeremy L Thompson if (basis != CEED_BASIS_NONE) { 734b3e95d5SJeremy L Thompson bool is_field_tensor; 744b3e95d5SJeremy L Thompson 754b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor)); 7674398b5aSJeremy L Thompson *is_all_tensor = *is_all_tensor && is_field_tensor; 774b3e95d5SJeremy L Thompson } 783a2968d6SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 794b3e95d5SJeremy L Thompson } 804b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 814b3e95d5SJeremy L Thompson CeedBasis basis; 824b3e95d5SJeremy L Thompson 834b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 844b3e95d5SJeremy L Thompson if (basis != CEED_BASIS_NONE) { 854b3e95d5SJeremy L Thompson bool is_field_tensor; 864b3e95d5SJeremy L Thompson 874b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor)); 8874398b5aSJeremy L Thompson *is_all_tensor = *is_all_tensor && is_field_tensor; 8974398b5aSJeremy L Thompson } 9074398b5aSJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 9174398b5aSJeremy L Thompson } 9274398b5aSJeremy L Thompson 9374398b5aSJeremy L Thompson // Find max_P, max_P_1d, Q, and Q_1d 9474398b5aSJeremy L Thompson bool is_all_3d = true; 9574398b5aSJeremy L Thompson 9674398b5aSJeremy L Thompson *max_P = 0; 9774398b5aSJeremy L Thompson *max_P_1d = 0; 9874398b5aSJeremy L Thompson *Q = 0; 9974398b5aSJeremy L Thompson *Q_1d = 0; 10074398b5aSJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 10174398b5aSJeremy L Thompson CeedBasis basis; 10274398b5aSJeremy L Thompson 10374398b5aSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 10474398b5aSJeremy L Thompson if (basis != CEED_BASIS_NONE) { 10574398b5aSJeremy L Thompson bool is_field_tensor; 10674398b5aSJeremy L Thompson CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0; 10774398b5aSJeremy L Thompson 10874398b5aSJeremy L Thompson // Check if 3D 1094b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &field_dim)); 11074398b5aSJeremy L Thompson is_all_3d = is_all_3d && (field_dim == 3); 11174398b5aSJeremy L Thompson *max_dim = CeedIntMax(*max_dim, field_dim); 11274398b5aSJeremy L Thompson 11374398b5aSJeremy L Thompson // Collect P, P_1d, Q, and Q_1d 11474398b5aSJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P)); 11574398b5aSJeremy L Thompson *max_P = CeedIntMax(*max_P, field_P); 11674398b5aSJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor)); 11774398b5aSJeremy L Thompson if (is_field_tensor) { 11874398b5aSJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d)); 11974398b5aSJeremy L Thompson *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d); 12074398b5aSJeremy L Thompson } 12174398b5aSJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q)); 12274398b5aSJeremy L Thompson CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible"); 12374398b5aSJeremy L Thompson *Q = field_Q; 12474398b5aSJeremy L Thompson if (is_field_tensor) { 12574398b5aSJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d)); 1264b3e95d5SJeremy L Thompson CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible"); 1274b3e95d5SJeremy L Thompson *Q_1d = field_Q_1d; 1284b3e95d5SJeremy L Thompson } 12974398b5aSJeremy L Thompson } 13074398b5aSJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 13174398b5aSJeremy L Thompson } 13274398b5aSJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 13374398b5aSJeremy L Thompson CeedBasis basis; 13474398b5aSJeremy L Thompson 13574398b5aSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 13674398b5aSJeremy L Thompson if (basis != CEED_BASIS_NONE) { 13774398b5aSJeremy L Thompson bool is_field_tensor; 13874398b5aSJeremy L Thompson CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0; 13974398b5aSJeremy L Thompson 14074398b5aSJeremy L Thompson // Check if 3D 14174398b5aSJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &field_dim)); 14274398b5aSJeremy L Thompson is_all_3d = is_all_3d && (field_dim == 3); 14374398b5aSJeremy L Thompson *max_dim = CeedIntMax(*max_dim, field_dim); 14474398b5aSJeremy L Thompson 14574398b5aSJeremy L Thompson // Collect P, P_1d, Q, and Q_1d 14674398b5aSJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P)); 14774398b5aSJeremy L Thompson *max_P = CeedIntMax(*max_P, field_P); 14874398b5aSJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor)); 14974398b5aSJeremy L Thompson if (is_field_tensor) { 15074398b5aSJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d)); 15174398b5aSJeremy L Thompson *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d); 15274398b5aSJeremy L Thompson } 15374398b5aSJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q)); 15474398b5aSJeremy L Thompson CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible"); 15574398b5aSJeremy L Thompson *Q = field_Q; 15674398b5aSJeremy L Thompson if (is_field_tensor) { 15774398b5aSJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d)); 15874398b5aSJeremy L Thompson CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible"); 15974398b5aSJeremy L Thompson *Q_1d = field_Q_1d; 16074398b5aSJeremy L Thompson } 16174398b5aSJeremy L Thompson } 1623a2968d6SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 1634b3e95d5SJeremy L Thompson } 1644b3e95d5SJeremy L Thompson 1654b3e95d5SJeremy L Thompson // Only use 3D collocated gradient parallelization strategy when gradient is computed 1664b3e95d5SJeremy L Thompson *use_3d_slices = false; 16774398b5aSJeremy L Thompson if (is_all_3d && *is_all_tensor) { 1684b3e95d5SJeremy L Thompson bool was_grad_found = false; 1694b3e95d5SJeremy L Thompson 1704b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 1714b3e95d5SJeremy L Thompson CeedEvalMode eval_mode; 1724b3e95d5SJeremy L Thompson 1734b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 1744b3e95d5SJeremy L Thompson if (eval_mode == CEED_EVAL_GRAD) { 1754b3e95d5SJeremy L Thompson CeedBasis_Hip_shared *basis_data; 1764b3e95d5SJeremy L Thompson CeedBasis basis; 1774b3e95d5SJeremy L Thompson 1784b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 1794b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &basis_data)); 1804b3e95d5SJeremy L Thompson *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true); 1814b3e95d5SJeremy L Thompson was_grad_found = true; 1823a2968d6SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 1834b3e95d5SJeremy L Thompson } 1844b3e95d5SJeremy L Thompson } 1854b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 1864b3e95d5SJeremy L Thompson CeedEvalMode eval_mode; 1874b3e95d5SJeremy L Thompson 1884b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 1894b3e95d5SJeremy L Thompson if (eval_mode == CEED_EVAL_GRAD) { 1904b3e95d5SJeremy L Thompson CeedBasis_Hip_shared *basis_data; 1914b3e95d5SJeremy L Thompson CeedBasis basis; 1924b3e95d5SJeremy L Thompson 1934b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 1944b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &basis_data)); 1954b3e95d5SJeremy L Thompson *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true); 1964b3e95d5SJeremy L Thompson was_grad_found = true; 1973a2968d6SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 1984b3e95d5SJeremy L Thompson } 1994b3e95d5SJeremy L Thompson } 2004b3e95d5SJeremy L Thompson } 2014b3e95d5SJeremy L Thompson return CEED_ERROR_SUCCESS; 2024b3e95d5SJeremy L Thompson } 2034b3e95d5SJeremy L Thompson 2044b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 2054b3e95d5SJeremy L Thompson // Setup fields 2064b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 2070183ed61SJeremy L Thompson static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i, 2080183ed61SJeremy L Thompson CeedOperatorField op_field, CeedQFunctionField qf_field, FieldReuse_Hip field_reuse, 2090183ed61SJeremy L Thompson CeedInt max_dim, CeedInt Q, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points, 210ca1da9b9SJeremy L Thompson bool use_3d_slices, bool skip_active_load) { 211ca1da9b9SJeremy L Thompson bool is_tensor = true, is_active = true; 21274398b5aSJeremy L Thompson CeedBasis basis; 213ca1da9b9SJeremy L Thompson 21474398b5aSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis)); 21574398b5aSJeremy L Thompson if (basis != CEED_BASIS_NONE) CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 216ca1da9b9SJeremy L Thompson { 217ca1da9b9SJeremy L Thompson CeedVector vec; 218ca1da9b9SJeremy L Thompson 219ca1da9b9SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_field, &vec)); 220ca1da9b9SJeremy L Thompson is_active = vec == CEED_VECTOR_ACTIVE; 221ca1da9b9SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec)); 222ca1da9b9SJeremy L Thompson } 22374398b5aSJeremy L Thompson 22459fa3f92SJeremy L Thompson const char *field_name; 2254b3e95d5SJeremy L Thompson std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i); 2269123fb08SJeremy L Thompson std::string P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q"; 2274b3e95d5SJeremy L Thompson std::string option_name = (is_input ? "inputs" : "outputs"); 2284b3e95d5SJeremy L Thompson CeedEvalMode eval_mode = CEED_EVAL_NONE; 22974398b5aSJeremy L Thompson CeedInt elem_size = 0, num_comp = 0, dim = max_dim, P_1d = 0; 2304b3e95d5SJeremy L Thompson CeedElemRestriction elem_rstr; 2314b3e95d5SJeremy L Thompson CeedBasis_Hip_shared *basis_data; 2324b3e95d5SJeremy L Thompson 2339ee499e5SJeremy L Thompson // Field reuse info 23445a787f7SJeremy L Thompson bool use_previous_field = field_reuse.index != -1; 2359ee499e5SJeremy L Thompson 23659fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_field, &field_name)); 2370183ed61SJeremy L Thompson code << tab << "// -- " << (is_input ? "Input" : "Output") << " field " << i << ": " << field_name << "\n"; 2384b3e95d5SJeremy L Thompson 2394b3e95d5SJeremy L Thompson // Get field data 2404b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr)); 2414b3e95d5SJeremy L Thompson if (elem_rstr != CEED_ELEMRESTRICTION_NONE) { 2424b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 2434b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 2444b3e95d5SJeremy L Thompson } 2453a2968d6SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 2464b3e95d5SJeremy L Thompson if (basis != CEED_BASIS_NONE) { 2474b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &basis_data)); 24874398b5aSJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 2499123fb08SJeremy L Thompson if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 2509123fb08SJeremy L Thompson else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d)); 2514b3e95d5SJeremy L Thompson } 2524b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode)); 2534b3e95d5SJeremy L Thompson 2544b3e95d5SJeremy L Thompson // Set field constants 2550183ed61SJeremy L Thompson code << tab << "const CeedInt dim" << var_suffix << " = " << dim << ";\n"; 25674398b5aSJeremy L Thompson if (is_tensor && !is_all_tensor) { 25774398b5aSJeremy L Thompson CeedInt P = 0; 25874398b5aSJeremy L Thompson 25974398b5aSJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); 2600183ed61SJeremy L Thompson code << tab << "const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q : P) << ";\n"; 26174398b5aSJeremy L Thompson } 2620183ed61SJeremy L Thompson code << tab << "const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n"; 263343e3094SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) { 2640183ed61SJeremy L Thompson code << tab << "const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n"; 2654b3e95d5SJeremy L Thompson } 2664b3e95d5SJeremy L Thompson 2674b3e95d5SJeremy L Thompson // Load basis data 2680183ed61SJeremy L Thompson code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 2694b3e95d5SJeremy L Thompson switch (eval_mode) { 2704b3e95d5SJeremy L Thompson case CEED_EVAL_NONE: 2714b3e95d5SJeremy L Thompson break; 2724b3e95d5SJeremy L Thompson case CEED_EVAL_INTERP: 2733a2968d6SJeremy L Thompson if (is_at_points) { 2743a2968d6SJeremy L Thompson // AtPoints 2753a2968d6SJeremy L Thompson if (!basis_data->d_chebyshev_interp_1d) { 2763a2968d6SJeremy L Thompson CeedSize interp_bytes; 2773a2968d6SJeremy L Thompson CeedScalar *chebyshev_interp_1d; 2783a2968d6SJeremy L Thompson 2793a2968d6SJeremy L Thompson interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 2803a2968d6SJeremy L Thompson CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 2813a2968d6SJeremy L Thompson CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 2823a2968d6SJeremy L Thompson CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes)); 2833a2968d6SJeremy L Thompson CeedCallHip(CeedBasisReturnCeed(basis), 2843a2968d6SJeremy L Thompson hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice)); 2853a2968d6SJeremy L Thompson CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 2863a2968d6SJeremy L Thompson } 2873a2968d6SJeremy L Thompson if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d; 2883a2968d6SJeremy L Thompson else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d; 2893a2968d6SJeremy L Thompson } else { 2903a2968d6SJeremy L Thompson // Standard quadrature 2914b3e95d5SJeremy L Thompson if (is_input) data->B.inputs[i] = basis_data->d_interp_1d; 2924b3e95d5SJeremy L Thompson else data->B.outputs[i] = basis_data->d_interp_1d; 2933a2968d6SJeremy L Thompson } 294ca1da9b9SJeremy L Thompson if (use_previous_field && !skip_active_load) { 29545a787f7SJeremy L Thompson std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index)); 2969ee499e5SJeremy L Thompson 2970183ed61SJeremy L Thompson code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n"; 2989ee499e5SJeremy L Thompson } else { 2990ccda8ebSJeremy L Thompson bool is_collocated = false; 3000ccda8ebSJeremy L Thompson 3010ccda8ebSJeremy L Thompson CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated)); 302ca1da9b9SJeremy L Thompson if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) { 3030ccda8ebSJeremy L Thompson code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n"; 3040ccda8ebSJeremy L Thompson } else { 3050183ed61SJeremy L Thompson code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n"; 3060183ed61SJeremy L Thompson code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n"; 3079ee499e5SJeremy L Thompson } 3080ccda8ebSJeremy L Thompson } 3094b3e95d5SJeremy L Thompson break; 3104b3e95d5SJeremy L Thompson case CEED_EVAL_GRAD: 3113a2968d6SJeremy L Thompson if (is_at_points) { 3123a2968d6SJeremy L Thompson // AtPoints 3133a2968d6SJeremy L Thompson if (!basis_data->d_chebyshev_interp_1d) { 3143a2968d6SJeremy L Thompson CeedSize interp_bytes; 3153a2968d6SJeremy L Thompson CeedScalar *chebyshev_interp_1d; 3163a2968d6SJeremy L Thompson 3173a2968d6SJeremy L Thompson interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 3183a2968d6SJeremy L Thompson CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 3193a2968d6SJeremy L Thompson CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 3203a2968d6SJeremy L Thompson CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes)); 3213a2968d6SJeremy L Thompson CeedCallHip(CeedBasisReturnCeed(basis), 3223a2968d6SJeremy L Thompson hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice)); 3233a2968d6SJeremy L Thompson CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 3243a2968d6SJeremy L Thompson } 3253a2968d6SJeremy L Thompson if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d; 3263a2968d6SJeremy L Thompson else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d; 3273a2968d6SJeremy L Thompson } else { 3283a2968d6SJeremy L Thompson // Standard quadrature 3294b3e95d5SJeremy L Thompson if (is_input) data->B.inputs[i] = basis_data->d_interp_1d; 3304b3e95d5SJeremy L Thompson else data->B.outputs[i] = basis_data->d_interp_1d; 3313a2968d6SJeremy L Thompson } 3329123fb08SJeremy L Thompson if (is_tensor) { 333ca1da9b9SJeremy L Thompson if (use_previous_field && !skip_active_load) { 33445a787f7SJeremy L Thompson std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index)); 3359ee499e5SJeremy L Thompson 3360183ed61SJeremy L Thompson code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n"; 3379ee499e5SJeremy L Thompson } else { 3380ccda8ebSJeremy L Thompson bool is_collocated = false; 3390ccda8ebSJeremy L Thompson 3400ccda8ebSJeremy L Thompson CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated)); 341ca1da9b9SJeremy L Thompson if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) { 3420ccda8ebSJeremy L Thompson code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n"; 3430ccda8ebSJeremy L Thompson } else { 3440183ed61SJeremy L Thompson code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n"; 3450183ed61SJeremy L Thompson code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n"; 3469123fb08SJeremy L Thompson } 3479ee499e5SJeremy L Thompson } 3480ccda8ebSJeremy L Thompson } 3493a2968d6SJeremy L Thompson if (is_at_points) break; // No G mat for AtPoints 3504b3e95d5SJeremy L Thompson if (use_3d_slices) { 3514b3e95d5SJeremy L Thompson if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d; 3524b3e95d5SJeremy L Thompson else data->G.outputs[i] = basis_data->d_collo_grad_1d; 353ca1da9b9SJeremy L Thompson if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) { 35445a787f7SJeremy L Thompson std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index)); 3559ee499e5SJeremy L Thompson 3560183ed61SJeremy L Thompson code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n"; 357ca1da9b9SJeremy L Thompson } else if (is_active && skip_active_load) { 358ca1da9b9SJeremy L Thompson code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n"; 3599ee499e5SJeremy L Thompson } else { 3600183ed61SJeremy L Thompson code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n"; 3610183ed61SJeremy L Thompson code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; 3629ee499e5SJeremy L Thompson } 3634b3e95d5SJeremy L Thompson } else { 3644b3e95d5SJeremy L Thompson bool has_collo_grad = basis_data->d_collo_grad_1d; 3654b3e95d5SJeremy L Thompson 3664b3e95d5SJeremy L Thompson if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; 3674b3e95d5SJeremy L Thompson else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; 3684b3e95d5SJeremy L Thompson if (has_collo_grad) { 369ca1da9b9SJeremy L Thompson if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) { 37045a787f7SJeremy L Thompson std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index)); 3719ee499e5SJeremy L Thompson 3720183ed61SJeremy L Thompson code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n"; 373ca1da9b9SJeremy L Thompson } else if (is_active && skip_active_load) { 374ca1da9b9SJeremy L Thompson code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n"; 3759ee499e5SJeremy L Thompson } else { 3760183ed61SJeremy L Thompson code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n"; 3770183ed61SJeremy L Thompson code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; 3789ee499e5SJeremy L Thompson } 3799ee499e5SJeremy L Thompson } else { 380ca1da9b9SJeremy L Thompson if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) { 38145a787f7SJeremy L Thompson std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index)); 3829ee499e5SJeremy L Thompson 3830183ed61SJeremy L Thompson code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n"; 384ca1da9b9SJeremy L Thompson } else if (is_active && skip_active_load) { 385ca1da9b9SJeremy L Thompson code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n"; 3864b3e95d5SJeremy L Thompson } else { 3870183ed61SJeremy L Thompson code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim") 38874398b5aSJeremy L Thompson << (is_tensor ? "" : var_suffix) << "];\n"; 3890183ed61SJeremy L Thompson code << tab << "LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << (is_tensor ? "" : var_suffix) << ">(data, G." 39074398b5aSJeremy L Thompson << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; 3914b3e95d5SJeremy L Thompson } 3924b3e95d5SJeremy L Thompson } 3939ee499e5SJeremy L Thompson } 3944b3e95d5SJeremy L Thompson break; 3954b3e95d5SJeremy L Thompson case CEED_EVAL_WEIGHT: 3964b3e95d5SJeremy L Thompson break; // No action 3974b3e95d5SJeremy L Thompson // LCOV_EXCL_START 3984b3e95d5SJeremy L Thompson case CEED_EVAL_DIV: 3994b3e95d5SJeremy L Thompson case CEED_EVAL_CURL: 4004b3e95d5SJeremy L Thompson break; // TODO: Not implemented 4014b3e95d5SJeremy L Thompson // LCOV_EXCL_STOP 4024b3e95d5SJeremy L Thompson } 4033a2968d6SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 4044b3e95d5SJeremy L Thompson return CEED_ERROR_SUCCESS; 4054b3e95d5SJeremy L Thompson } 4064b3e95d5SJeremy L Thompson 4074b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 4084b3e95d5SJeremy L Thompson // Restriction 4094b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 4100183ed61SJeremy L Thompson static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i, 4110183ed61SJeremy L Thompson CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field, 4120183ed61SJeremy L Thompson CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points, 4130183ed61SJeremy L Thompson bool use_3d_slices) { 4144b3e95d5SJeremy L Thompson std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i); 41574398b5aSJeremy L Thompson std::string P_name = (is_all_tensor ? "P_1d" : "P") + var_suffix; 4164b3e95d5SJeremy L Thompson CeedEvalMode eval_mode = CEED_EVAL_NONE; 41774398b5aSJeremy L Thompson CeedInt elem_size = 0, num_comp = 0; 4184b3e95d5SJeremy L Thompson CeedSize l_size; 419f815fac9SJeremy L Thompson CeedRestrictionType rstr_type = CEED_RESTRICTION_STANDARD; 4204b3e95d5SJeremy L Thompson CeedElemRestriction_Hip *rstr_data; 4214b3e95d5SJeremy L Thompson CeedElemRestriction elem_rstr; 4224b3e95d5SJeremy L Thompson 4234b3e95d5SJeremy L Thompson // Get field data 4244b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr)); 4254b3e95d5SJeremy L Thompson if (elem_rstr != CEED_ELEMRESTRICTION_NONE) { 426f815fac9SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type)); 4274b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 4284b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 4294b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); 4304b3e95d5SJeremy L Thompson } 4314b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode)); 4324b3e95d5SJeremy L Thompson 4334b3e95d5SJeremy L Thompson // Restriction 4344b3e95d5SJeremy L Thompson if (is_input) { 4354b3e95d5SJeremy L Thompson // Input 436e93651e5SJeremy L Thompson if (field_input_buffer[i] != i) { 437e93651e5SJeremy L Thompson std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]); 438e93651e5SJeremy L Thompson 439e93651e5SJeremy L Thompson // Restriction was already done for previous input 4400183ed61SJeremy L Thompson code << tab << "CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n"; 4413a2968d6SJeremy L Thompson } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) { 4423a2968d6SJeremy L Thompson if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) { 443e93651e5SJeremy L Thompson // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated 4440183ed61SJeremy L Thompson code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n"; 4453a2968d6SJeremy L Thompson } else if (rstr_type != CEED_RESTRICTION_POINTS) { 446e93651e5SJeremy L Thompson // Otherwise we're using the scratch space 4470183ed61SJeremy L Thompson code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n"; 448e93651e5SJeremy L Thompson } 449f815fac9SJeremy L Thompson switch (rstr_type) { 450f815fac9SJeremy L Thompson case CEED_RESTRICTION_STANDARD: { 4514b3e95d5SJeremy L Thompson CeedInt comp_stride; 4524b3e95d5SJeremy L Thompson 4534b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 4540183ed61SJeremy L Thompson code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; 4554b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 4560183ed61SJeremy L Thompson code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 4574b3e95d5SJeremy L Thompson data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; 4580183ed61SJeremy L Thompson code << tab << "ReadLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", " 4590183ed61SJeremy L Thompson << P_name << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix 4600183ed61SJeremy L Thompson << ");\n"; 461f815fac9SJeremy L Thompson break; 462f815fac9SJeremy L Thompson } 463f815fac9SJeremy L Thompson case CEED_RESTRICTION_STRIDED: { 4644b3e95d5SJeremy L Thompson bool has_backend_strides; 4654b3e95d5SJeremy L Thompson CeedInt num_elem; 4664b3e95d5SJeremy L Thompson 4674b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); 4684b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); 4694b3e95d5SJeremy L Thompson CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 4704b3e95d5SJeremy L Thompson 4714b3e95d5SJeremy L Thompson if (!has_backend_strides) { 4724b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); 4734b3e95d5SJeremy L Thompson } 4740183ed61SJeremy L Thompson code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1] 4750183ed61SJeremy L Thompson << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n"; 4760183ed61SJeremy L Thompson code << tab << "ReadLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides" 4770183ed61SJeremy L Thompson << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, d" << var_suffix << ", r_e" 4780183ed61SJeremy L Thompson << var_suffix << ");\n"; 479f815fac9SJeremy L Thompson break; 480f815fac9SJeremy L Thompson } 4813a2968d6SJeremy L Thompson case CEED_RESTRICTION_POINTS: { 4823a2968d6SJeremy L Thompson CeedInt comp_stride; 4833a2968d6SJeremy L Thompson 4843a2968d6SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 4850183ed61SJeremy L Thompson code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 4863a2968d6SJeremy L Thompson data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; 4873a2968d6SJeremy L Thompson break; 4883a2968d6SJeremy L Thompson } 489f815fac9SJeremy L Thompson // LCOV_EXCL_START 490f815fac9SJeremy L Thompson case CEED_RESTRICTION_ORIENTED: 491f815fac9SJeremy L Thompson case CEED_RESTRICTION_CURL_ORIENTED: 492f815fac9SJeremy L Thompson break; // TODO: Not implemented 493f815fac9SJeremy L Thompson // LCOV_EXCL_STOP 4944b3e95d5SJeremy L Thompson } 4954b3e95d5SJeremy L Thompson } 4964b3e95d5SJeremy L Thompson } else { 4974b3e95d5SJeremy L Thompson // Output 498f815fac9SJeremy L Thompson switch (rstr_type) { 499f815fac9SJeremy L Thompson case CEED_RESTRICTION_STANDARD: { 5004b3e95d5SJeremy L Thompson CeedInt comp_stride; 5014b3e95d5SJeremy L Thompson 5024b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 5030183ed61SJeremy L Thompson code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; 5044b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 5050183ed61SJeremy L Thompson code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 5064b3e95d5SJeremy L Thompson data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets; 5070183ed61SJeremy L Thompson code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", " 5080183ed61SJeremy L Thompson << P_name << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix 5090183ed61SJeremy L Thompson << ");\n"; 510f815fac9SJeremy L Thompson break; 511f815fac9SJeremy L Thompson } 512f815fac9SJeremy L Thompson case CEED_RESTRICTION_STRIDED: { 5134b3e95d5SJeremy L Thompson bool has_backend_strides; 5144b3e95d5SJeremy L Thompson CeedInt num_elem; 5154b3e95d5SJeremy L Thompson 5164b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); 5174b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); 5184b3e95d5SJeremy L Thompson CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 5194b3e95d5SJeremy L Thompson 5204b3e95d5SJeremy L Thompson if (!has_backend_strides) { 5214b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); 5224b3e95d5SJeremy L Thompson } 5230183ed61SJeremy L Thompson code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1] 5240183ed61SJeremy L Thompson << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n"; 5250183ed61SJeremy L Thompson code << tab << "WriteLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides" 5260183ed61SJeremy L Thompson << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, r_e" << var_suffix << ", d" << var_suffix 5270183ed61SJeremy L Thompson << ");\n"; 528f815fac9SJeremy L Thompson break; 529f815fac9SJeremy L Thompson } 5303a2968d6SJeremy L Thompson case CEED_RESTRICTION_POINTS: 5313a2968d6SJeremy L Thompson data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets; 5323a2968d6SJeremy L Thompson break; 533f815fac9SJeremy L Thompson // LCOV_EXCL_START 534f815fac9SJeremy L Thompson case CEED_RESTRICTION_ORIENTED: 535f815fac9SJeremy L Thompson case CEED_RESTRICTION_CURL_ORIENTED: 536f815fac9SJeremy L Thompson break; // TODO: Not implemented 537f815fac9SJeremy L Thompson // LCOV_EXCL_STOP 5384b3e95d5SJeremy L Thompson } 5394b3e95d5SJeremy L Thompson } 5403a2968d6SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 5414b3e95d5SJeremy L Thompson return CEED_ERROR_SUCCESS; 5424b3e95d5SJeremy L Thompson } 5434b3e95d5SJeremy L Thompson 5444b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 5454b3e95d5SJeremy L Thompson // Basis 5464b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 5470183ed61SJeremy L Thompson static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i, CeedOperatorField op_field, 54874398b5aSJeremy L Thompson CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor, 5493a2968d6SJeremy L Thompson bool is_at_points, bool use_3d_slices) { 5500ccda8ebSJeremy L Thompson bool is_tensor = true, is_collocated = true; 55174398b5aSJeremy L Thompson CeedBasis basis; 55274398b5aSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis)); 55374398b5aSJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 5540ccda8ebSJeremy L Thompson CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated)); 55574398b5aSJeremy L Thompson 5564b3e95d5SJeremy L Thompson std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i); 5579123fb08SJeremy L Thompson std::string P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q"; 5584b3e95d5SJeremy L Thompson CeedEvalMode eval_mode = CEED_EVAL_NONE; 55974398b5aSJeremy L Thompson CeedInt dim = max_dim, elem_size = 0, num_comp = 0, P_1d = 0; 5604b3e95d5SJeremy L Thompson CeedElemRestriction elem_rstr; 5614b3e95d5SJeremy L Thompson 5624b3e95d5SJeremy L Thompson // Get field data 5634b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr)); 5644b3e95d5SJeremy L Thompson if (elem_rstr != CEED_ELEMRESTRICTION_NONE) { 5654b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 5664b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 5674b3e95d5SJeremy L Thompson } 5683a2968d6SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 5694b3e95d5SJeremy L Thompson if (basis != CEED_BASIS_NONE) { 57074398b5aSJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 5719123fb08SJeremy L Thompson if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 5729123fb08SJeremy L Thompson else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d)); 5734b3e95d5SJeremy L Thompson } 5744b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode)); 5754b3e95d5SJeremy L Thompson 5764b3e95d5SJeremy L Thompson // Basis 5770183ed61SJeremy L Thompson code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 5784b3e95d5SJeremy L Thompson if (is_input) { 5794b3e95d5SJeremy L Thompson switch (eval_mode) { 5804b3e95d5SJeremy L Thompson case CEED_EVAL_NONE: 5813a2968d6SJeremy L Thompson if (!use_3d_slices && !is_at_points) { 5820183ed61SJeremy L Thompson code << tab << "CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n"; 5834b3e95d5SJeremy L Thompson } 5844b3e95d5SJeremy L Thompson break; 5854b3e95d5SJeremy L Thompson case CEED_EVAL_INTERP: 5863a2968d6SJeremy L Thompson if (is_at_points) { 5879123fb08SJeremy L Thompson std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d"; 5889123fb08SJeremy L Thompson 5890183ed61SJeremy L Thompson code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n"; 5900183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix 5916b92dc4bSJeremy L Thompson << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n"; 5923a2968d6SJeremy L Thompson } else { 5930ccda8ebSJeremy L Thompson std::string function_name = is_tensor ? ((dim == 1 ? "Interp" : "InterpTensor") + std::string(is_collocated ? "CollocatedNodes" : "") + 5940ccda8ebSJeremy L Thompson std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened")) 59574398b5aSJeremy L Thompson : "InterpNonTensor"; 59674398b5aSJeremy L Thompson std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name); 5979123fb08SJeremy L Thompson 5980183ed61SJeremy L Thompson code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n"; 5990183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e" 60074398b5aSJeremy L Thompson << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n"; 6013a2968d6SJeremy L Thompson } 6024b3e95d5SJeremy L Thompson break; 6034b3e95d5SJeremy L Thompson case CEED_EVAL_GRAD: 6043a2968d6SJeremy L Thompson if (is_at_points) { 6059123fb08SJeremy L Thompson std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d"; 6069123fb08SJeremy L Thompson 6070183ed61SJeremy L Thompson code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n"; 6080183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix 6096b92dc4bSJeremy L Thompson << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n"; 6103a2968d6SJeremy L Thompson } else if (use_3d_slices) { 6110ccda8ebSJeremy L Thompson std::string function_name = 6120ccda8ebSJeremy L Thompson (dim > 1 ? "InterpTensor" : "Interp") + std::string(is_collocated ? "CollocatedNodes" : "") + std::to_string(dim) + "d"; 6139123fb08SJeremy L Thompson 6140183ed61SJeremy L Thompson code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n"; 6150183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix 6166b92dc4bSJeremy L Thompson << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n"; 6179123fb08SJeremy L Thompson } else if (is_tensor) { 6180ccda8ebSJeremy L Thompson bool is_collocated_grad = dim == 3 && Q_1d >= P_1d; 6190ccda8ebSJeremy L Thompson std::string function_name = 6200ccda8ebSJeremy L Thompson (dim == 1 ? "Grad" : ("GradTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) + 6210ccda8ebSJeremy L Thompson std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"); 62274398b5aSJeremy L Thompson std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name); 6239123fb08SJeremy L Thompson 6240183ed61SJeremy L Thompson code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*" 62574398b5aSJeremy L Thompson << (is_all_tensor && dim >= 3 ? Q_name : "1") << "];\n"; 6260183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e" 62774398b5aSJeremy L Thompson << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n"; 6284b3e95d5SJeremy L Thompson } else { 6299123fb08SJeremy L Thompson std::string function_name = "GradNonTensor"; 6309123fb08SJeremy L Thompson 6310183ed61SJeremy L Thompson code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n"; 6320183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name 63374398b5aSJeremy L Thompson << ", OP_T_1D>(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n"; 6344b3e95d5SJeremy L Thompson } 6354b3e95d5SJeremy L Thompson break; 6364b3e95d5SJeremy L Thompson case CEED_EVAL_WEIGHT: { 6373a2968d6SJeremy L Thompson if (is_at_points) { 6380183ed61SJeremy L Thompson code << tab << "// Nothing to do AtPoints\n"; 6393a2968d6SJeremy L Thompson } else { 6404b3e95d5SJeremy L Thompson CeedBasis_Hip_shared *basis_data; 64174398b5aSJeremy L Thompson std::string function_name = is_tensor 64274398b5aSJeremy L Thompson ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened")) 64374398b5aSJeremy L Thompson : "WeightNonTensor"; 6444b3e95d5SJeremy L Thompson 6450183ed61SJeremy L Thompson code << tab << "CeedScalar r_q" << var_suffix << "[" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n"; 6464b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &basis_data)); 6474b3e95d5SJeremy L Thompson data->W = basis_data->d_q_weight_1d; 6480183ed61SJeremy L Thompson code << tab << function_name << "<" << P_name << ", " << Q_name << ">(data, W, r_q" << var_suffix << ");\n"; 6493a2968d6SJeremy L Thompson } 6504b3e95d5SJeremy L Thompson break; 6514b3e95d5SJeremy L Thompson } 6524b3e95d5SJeremy L Thompson // LCOV_EXCL_START 6534b3e95d5SJeremy L Thompson case CEED_EVAL_DIV: 6544b3e95d5SJeremy L Thompson case CEED_EVAL_CURL: 6554b3e95d5SJeremy L Thompson break; // TODO: Not implemented 6564b3e95d5SJeremy L Thompson // LCOV_EXCL_STOP 6574b3e95d5SJeremy L Thompson } 6584b3e95d5SJeremy L Thompson } else { 6594b3e95d5SJeremy L Thompson switch (eval_mode) { 6604b3e95d5SJeremy L Thompson case CEED_EVAL_NONE: 6610183ed61SJeremy L Thompson code << tab << "CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n"; 6624b3e95d5SJeremy L Thompson break; // No action 6634b3e95d5SJeremy L Thompson case CEED_EVAL_INTERP: 6640183ed61SJeremy L Thompson code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n"; 6653a2968d6SJeremy L Thompson if (is_at_points) { 6669123fb08SJeremy L Thompson std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d"; 6679123fb08SJeremy L Thompson 6680183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix 6696b92dc4bSJeremy L Thompson << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; 6703a2968d6SJeremy L Thompson } else { 6719123fb08SJeremy L Thompson std::string function_name = 6720ccda8ebSJeremy L Thompson is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") + 6730ccda8ebSJeremy L Thompson std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened")) 67474398b5aSJeremy L Thompson : "InterpTransposeNonTensor"; 67574398b5aSJeremy L Thompson std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name); 6769123fb08SJeremy L Thompson 6770183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q" 67874398b5aSJeremy L Thompson << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; 6793a2968d6SJeremy L Thompson } 6804b3e95d5SJeremy L Thompson break; 6814b3e95d5SJeremy L Thompson case CEED_EVAL_GRAD: 6820183ed61SJeremy L Thompson code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n"; 6833a2968d6SJeremy L Thompson if (is_at_points) { 6849123fb08SJeremy L Thompson std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d"; 6859123fb08SJeremy L Thompson 6860183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix 6876b92dc4bSJeremy L Thompson << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; 6883a2968d6SJeremy L Thompson } else if (use_3d_slices) { 6890ccda8ebSJeremy L Thompson std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") + 6900ccda8ebSJeremy L Thompson std::to_string(dim) + "d"; 6919123fb08SJeremy L Thompson 6920183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix 6936b92dc4bSJeremy L Thompson << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; 6949123fb08SJeremy L Thompson } else if (is_tensor) { 6950ccda8ebSJeremy L Thompson bool is_collocated_grad = dim == 3 && Q_1d >= P_1d; 6960ccda8ebSJeremy L Thompson std::string function_name = 6970ccda8ebSJeremy L Thompson (dim == 1 ? "GradTranspose" 6980ccda8ebSJeremy L Thompson : ("GradTransposeTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) + 69974398b5aSJeremy L Thompson std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"); 70074398b5aSJeremy L Thompson std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name); 7019123fb08SJeremy L Thompson 7020183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q" 70374398b5aSJeremy L Thompson << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n"; 7044b3e95d5SJeremy L Thompson } else { 7059123fb08SJeremy L Thompson std::string function_name = "GradTransposeNonTensor"; 7069123fb08SJeremy L Thompson 7070183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name 70874398b5aSJeremy L Thompson << ", OP_T_1D>(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n"; 7094b3e95d5SJeremy L Thompson } 7104b3e95d5SJeremy L Thompson break; 7114b3e95d5SJeremy L Thompson // LCOV_EXCL_START 7124b3e95d5SJeremy L Thompson case CEED_EVAL_WEIGHT: 7134b3e95d5SJeremy L Thompson break; // Should not occur 7144b3e95d5SJeremy L Thompson case CEED_EVAL_DIV: 7154b3e95d5SJeremy L Thompson case CEED_EVAL_CURL: 7164b3e95d5SJeremy L Thompson break; // TODO: Not implemented 7174b3e95d5SJeremy L Thompson // LCOV_EXCL_STOP 7184b3e95d5SJeremy L Thompson } 7194b3e95d5SJeremy L Thompson } 7203a2968d6SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 7214b3e95d5SJeremy L Thompson return CEED_ERROR_SUCCESS; 7224b3e95d5SJeremy L Thompson } 7234b3e95d5SJeremy L Thompson 7244b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 7254b3e95d5SJeremy L Thompson // QFunction 7264b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 7270183ed61SJeremy L Thompson static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt max_dim, 7280183ed61SJeremy L Thompson CeedInt max_num_points, CeedInt num_input_fields, CeedOperatorField *op_input_fields, 7290183ed61SJeremy L Thompson CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, 7300183ed61SJeremy L Thompson CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields, 7310183ed61SJeremy L Thompson std::string qfunction_name, CeedInt Q_1d, bool is_all_tensor, bool is_at_points, 732*745f16d1SZach Atkins bool use_3d_slices, bool is_assemble) { 73374398b5aSJeremy L Thompson std::string Q_name = is_all_tensor ? "Q_1d" : "Q"; 7344b3e95d5SJeremy L Thompson CeedEvalMode eval_mode = CEED_EVAL_NONE; 7354b3e95d5SJeremy L Thompson CeedElemRestriction elem_rstr; 7364b3e95d5SJeremy L Thompson 7378b97b69aSJeremy L Thompson // Setup output arrays 7380183ed61SJeremy L Thompson code << "\n"; 7390183ed61SJeremy L Thompson code << tab << "// -- Output field setup\n"; 7404b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 74159fa3f92SJeremy L Thompson const char *field_name; 7424b3e95d5SJeremy L Thompson std::string var_suffix = "_out_" + std::to_string(i); 7434b3e95d5SJeremy L Thompson 74459fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 7450183ed61SJeremy L Thompson code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 7464b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 7473a2968d6SJeremy L Thompson switch (eval_mode) { 7483a2968d6SJeremy L Thompson case CEED_EVAL_NONE: 7493a2968d6SJeremy L Thompson if (is_at_points) { 7500183ed61SJeremy L Thompson code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n"; 7513a2968d6SJeremy L Thompson } else { 7520183ed61SJeremy L Thompson code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") 75374398b5aSJeremy L Thompson << "];\n"; 7544b3e95d5SJeremy L Thompson } 7553a2968d6SJeremy L Thompson break; 7563a2968d6SJeremy L Thompson case CEED_EVAL_INTERP: 7573a2968d6SJeremy L Thompson if (is_at_points) { 7583a2968d6SJeremy L Thompson // Accumulator for point data 7590183ed61SJeremy L Thompson code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n"; 7600183ed61SJeremy L Thompson code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix 761b8245c6cSJeremy L Thompson << "[i] = 0.0;\n"; 7623a2968d6SJeremy L Thompson } else { 7630183ed61SJeremy L Thompson code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") 76474398b5aSJeremy L Thompson << "];\n"; 7653a2968d6SJeremy L Thompson } 7663a2968d6SJeremy L Thompson break; 7673a2968d6SJeremy L Thompson case CEED_EVAL_GRAD: 7683a2968d6SJeremy L Thompson if (is_at_points) { 7693a2968d6SJeremy L Thompson // Accumulator for point data 7700183ed61SJeremy L Thompson code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n"; 7710183ed61SJeremy L Thompson code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix 772b8245c6cSJeremy L Thompson << "[i] = 0.0;\n"; 7733a2968d6SJeremy L Thompson } else if (use_3d_slices) { 7744b3e95d5SJeremy L Thompson // Accumulator for gradient slices 7750183ed61SJeremy L Thompson code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n"; 7760183ed61SJeremy L Thompson code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) r_q" << var_suffix << "[i] = 0.0;\n"; 7774b3e95d5SJeremy L Thompson } else { 7780183ed61SJeremy L Thompson code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*" 77974398b5aSJeremy L Thompson << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") << "];\n"; 7804b3e95d5SJeremy L Thompson } 7813a2968d6SJeremy L Thompson break; 7823a2968d6SJeremy L Thompson case CEED_EVAL_WEIGHT: 7833a2968d6SJeremy L Thompson break; 7843a2968d6SJeremy L Thompson // LCOV_EXCL_START 7853a2968d6SJeremy L Thompson case CEED_EVAL_DIV: 7863a2968d6SJeremy L Thompson case CEED_EVAL_CURL: 7873a2968d6SJeremy L Thompson break; // TODO: Not implemented 7883a2968d6SJeremy L Thompson // LCOV_EXCL_STOP 7894b3e95d5SJeremy L Thompson } 7904b3e95d5SJeremy L Thompson } 7914b3e95d5SJeremy L Thompson 7923a2968d6SJeremy L Thompson if (is_at_points) { 7933a2968d6SJeremy L Thompson // We need to handle batches of points 7940183ed61SJeremy L Thompson code << "\n"; 7950183ed61SJeremy L Thompson code << tab << "// Note: Using batches of points\n"; 7960183ed61SJeremy L Thompson code << tab << "const CeedInt point_loop_bound = (blockDim.x*blockDim.y) * ceil((1.0*max_num_points) / (blockDim.x*blockDim.y));\n\n"; 7970183ed61SJeremy L Thompson code << tab << "#pragma unroll\n"; 7980183ed61SJeremy L Thompson code << tab << "for (CeedInt i = threadIdx.x + threadIdx.y*blockDim.x; i < point_loop_bound; i += blockDim.x*blockDim.y) {\n"; 7990183ed61SJeremy L Thompson tab.push(); 8000183ed61SJeremy L Thompson code << tab << "const CeedInt p = i % max_num_points;\n\n"; 8013a2968d6SJeremy L Thompson 8020183ed61SJeremy L Thompson code << tab << "// -- Coordinates\n"; 8030183ed61SJeremy L Thompson code << tab << "CeedScalar r_x[max_dim];\n"; 8040183ed61SJeremy L Thompson code << tab << "ReadPoint<max_dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n"; 8053a2968d6SJeremy L Thompson 8060183ed61SJeremy L Thompson code << tab << "// -- Input fields\n"; 8073a2968d6SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 80859fa3f92SJeremy L Thompson const char *field_name; 8093a2968d6SJeremy L Thompson std::string var_suffix = "_in_" + std::to_string(i); 810f725b54bSJeremy L Thompson std::string P_name = "P_1d" + var_suffix; 8113a2968d6SJeremy L Thompson 81259fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name)); 8130183ed61SJeremy L Thompson code << tab << "// ---- Input field " << i << ": " << field_name << "\n"; 8143a2968d6SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 8153a2968d6SJeremy L Thompson // Basis action 8160183ed61SJeremy L Thompson code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 8173a2968d6SJeremy L Thompson switch (eval_mode) { 8183a2968d6SJeremy L Thompson case CEED_EVAL_NONE: 8190183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 8200183ed61SJeremy L Thompson code << tab << "ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix 8213a2968d6SJeremy L Thompson << ", max_num_points>(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n"; 8223a2968d6SJeremy L Thompson break; 8233a2968d6SJeremy L Thompson case CEED_EVAL_INTERP: 8240183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 8250183ed61SJeremy L Thompson code << tab << "InterpAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name 82674398b5aSJeremy L Thompson << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n"; 8273a2968d6SJeremy L Thompson break; 8283a2968d6SJeremy L Thompson case CEED_EVAL_GRAD: 8290183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n"; 8300183ed61SJeremy L Thompson code << tab << "GradAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name 83174398b5aSJeremy L Thompson << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n"; 8323a2968d6SJeremy L Thompson break; 8333a2968d6SJeremy L Thompson case CEED_EVAL_WEIGHT: 8340183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[1];\n"; 8350183ed61SJeremy L Thompson code << tab << "r_s" << var_suffix << "[0] = 1.0;\n"; 8363a2968d6SJeremy L Thompson break; 8373a2968d6SJeremy L Thompson // LCOV_EXCL_START 8383a2968d6SJeremy L Thompson case CEED_EVAL_DIV: 8393a2968d6SJeremy L Thompson case CEED_EVAL_CURL: 8403a2968d6SJeremy L Thompson break; // TODO: Not implemented 8413a2968d6SJeremy L Thompson // LCOV_EXCL_STOP 8423a2968d6SJeremy L Thompson } 8433a2968d6SJeremy L Thompson } 8440183ed61SJeremy L Thompson code << "\n"; 8450183ed61SJeremy L Thompson code << tab << "// -- Output fields\n"; 8463a2968d6SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 84759fa3f92SJeremy L Thompson const char *field_name; 8483a2968d6SJeremy L Thompson std::string var_suffix = "_out_" + std::to_string(i); 8493a2968d6SJeremy L Thompson 85059fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 8510183ed61SJeremy L Thompson code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 8523a2968d6SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 8533a2968d6SJeremy L Thompson // Basis action 8543a2968d6SJeremy L Thompson switch (eval_mode) { 8553a2968d6SJeremy L Thompson case CEED_EVAL_NONE: 8560183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 8573a2968d6SJeremy L Thompson break; 8583a2968d6SJeremy L Thompson case CEED_EVAL_INTERP: 8590183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 8603a2968d6SJeremy L Thompson break; 8613a2968d6SJeremy L Thompson case CEED_EVAL_GRAD: 8620183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n"; 8633a2968d6SJeremy L Thompson break; 8643a2968d6SJeremy L Thompson // LCOV_EXCL_START 8653a2968d6SJeremy L Thompson case CEED_EVAL_WEIGHT: 8663a2968d6SJeremy L Thompson break; // Should not occur 8673a2968d6SJeremy L Thompson case CEED_EVAL_DIV: 8683a2968d6SJeremy L Thompson case CEED_EVAL_CURL: 8693a2968d6SJeremy L Thompson break; // TODO: Not implemented 8703a2968d6SJeremy L Thompson // LCOV_EXCL_STOP 8713a2968d6SJeremy L Thompson } 8723a2968d6SJeremy L Thompson } 8733a2968d6SJeremy L Thompson 8743a2968d6SJeremy L Thompson } else if (use_3d_slices) { 8754b3e95d5SJeremy L Thompson // We treat quadrature points per slice in 3d to save registers 8760183ed61SJeremy L Thompson code << "\n"; 8770183ed61SJeremy L Thompson code << tab << "// Note: Using planes of 3D elements\n"; 8780183ed61SJeremy L Thompson code << tab << "#pragma unroll\n"; 8790183ed61SJeremy L Thompson code << tab << "for (CeedInt q = 0; q < " << Q_name << "; q++) {\n"; 8800183ed61SJeremy L Thompson tab.push(); 8810183ed61SJeremy L Thompson code << tab << "// -- Input fields\n"; 8824b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 88359fa3f92SJeremy L Thompson const char *field_name; 8844b3e95d5SJeremy L Thompson std::string var_suffix = "_in_" + std::to_string(i); 8854b3e95d5SJeremy L Thompson 88659fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name)); 8870183ed61SJeremy L Thompson code << tab << "// ---- Input field " << i << ": " << field_name << "\n"; 8884b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 8894b3e95d5SJeremy L Thompson // Basis action 8900183ed61SJeremy L Thompson code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 8914b3e95d5SJeremy L Thompson switch (eval_mode) { 8924b3e95d5SJeremy L Thompson case CEED_EVAL_NONE: 8934b3e95d5SJeremy L Thompson bool is_strided; 8944b3e95d5SJeremy L Thompson 8950183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 8964b3e95d5SJeremy L Thompson 8974b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 8984b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); 8994b3e95d5SJeremy L Thompson if (is_strided) { 9004b3e95d5SJeremy L Thompson bool has_backend_strides; 9014b3e95d5SJeremy L Thompson CeedInt num_elem, elem_size; 9024b3e95d5SJeremy L Thompson 9034b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 9044b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); 9054b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); 9064b3e95d5SJeremy L Thompson CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 9074b3e95d5SJeremy L Thompson 9084b3e95d5SJeremy L Thompson if (!has_backend_strides) { 9094b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); 9104b3e95d5SJeremy L Thompson } 9110183ed61SJeremy L Thompson code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1] 9120183ed61SJeremy L Thompson << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n"; 9130183ed61SJeremy L Thompson code << tab << "ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", strides" << var_suffix << "_0, strides" 9140183ed61SJeremy L Thompson << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n"; 9154b3e95d5SJeremy L Thompson } else { 9164b3e95d5SJeremy L Thompson CeedSize l_size = 0; 9174b3e95d5SJeremy L Thompson CeedInt comp_stride; 9184b3e95d5SJeremy L Thompson CeedElemRestriction_Hip *rstr_data; 9194b3e95d5SJeremy L Thompson 9204b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 9210183ed61SJeremy L Thompson code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; 9224b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 9230183ed61SJeremy L Thompson code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 9244b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); 9254b3e95d5SJeremy L Thompson data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; 9260183ed61SJeremy L Thompson code << tab << "ReadEVecSliceStandard3d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", " << Q_name << ">(data, l_size" 9270183ed61SJeremy L Thompson << var_suffix << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n"; 9284b3e95d5SJeremy L Thompson } 9299123fb08SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 9304b3e95d5SJeremy L Thompson break; 9314b3e95d5SJeremy L Thompson case CEED_EVAL_INTERP: 9320183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 9330183ed61SJeremy L Thompson code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n"; 9340183ed61SJeremy L Thompson tab.push(); 9350183ed61SJeremy L Thompson code << tab << "r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n"; 9360183ed61SJeremy L Thompson tab.pop(); 9370183ed61SJeremy L Thompson code << tab << "}\n"; 9384b3e95d5SJeremy L Thompson break; 9394b3e95d5SJeremy L Thompson case CEED_EVAL_GRAD: 9400183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n"; 9410183ed61SJeremy L Thompson code << tab << "GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_q" << var_suffix << ", s_G" 9426b92dc4bSJeremy L Thompson << var_suffix << ", r_s" << var_suffix << ");\n"; 9434b3e95d5SJeremy L Thompson break; 9444b3e95d5SJeremy L Thompson case CEED_EVAL_WEIGHT: 9450183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[1];\n"; 9460183ed61SJeremy L Thompson code << tab << "r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n"; 9473a2968d6SJeremy L Thompson break; 9484b3e95d5SJeremy L Thompson // LCOV_EXCL_START 9494b3e95d5SJeremy L Thompson case CEED_EVAL_DIV: 9504b3e95d5SJeremy L Thompson case CEED_EVAL_CURL: 9514b3e95d5SJeremy L Thompson break; // TODO: Not implemented 9524b3e95d5SJeremy L Thompson // LCOV_EXCL_STOP 9534b3e95d5SJeremy L Thompson } 9544b3e95d5SJeremy L Thompson } 9550183ed61SJeremy L Thompson code << "\n"; 9560183ed61SJeremy L Thompson code << tab << "// -- Output fields\n"; 9574b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 95859fa3f92SJeremy L Thompson const char *field_name; 9594b3e95d5SJeremy L Thompson std::string var_suffix = "_out_" + std::to_string(i); 9604b3e95d5SJeremy L Thompson 96159fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 9620183ed61SJeremy L Thompson code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 9634b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 9644b3e95d5SJeremy L Thompson // Basis action 9654b3e95d5SJeremy L Thompson switch (eval_mode) { 9664b3e95d5SJeremy L Thompson case CEED_EVAL_NONE: 9670183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 9683a2968d6SJeremy L Thompson break; 9694b3e95d5SJeremy L Thompson case CEED_EVAL_INTERP: 9700183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 9714b3e95d5SJeremy L Thompson break; 9724b3e95d5SJeremy L Thompson case CEED_EVAL_GRAD: 9730183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n"; 9744b3e95d5SJeremy L Thompson break; 9754b3e95d5SJeremy L Thompson // LCOV_EXCL_START 9764b3e95d5SJeremy L Thompson case CEED_EVAL_WEIGHT: 9774b3e95d5SJeremy L Thompson break; // Should not occur 9784b3e95d5SJeremy L Thompson case CEED_EVAL_DIV: 9794b3e95d5SJeremy L Thompson case CEED_EVAL_CURL: 9804b3e95d5SJeremy L Thompson break; // TODO: Not implemented 9814b3e95d5SJeremy L Thompson // LCOV_EXCL_STOP 9824b3e95d5SJeremy L Thompson } 9834b3e95d5SJeremy L Thompson } 9844b3e95d5SJeremy L Thompson } else { 9850183ed61SJeremy L Thompson code << "\n"; 9860183ed61SJeremy L Thompson code << tab << "// Note: Using full elements\n"; 9870183ed61SJeremy L Thompson code << tab << "{\n"; 9880183ed61SJeremy L Thompson tab.push(); 9890183ed61SJeremy L Thompson code << tab << "// -- Input fields\n"; 9904b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 99159fa3f92SJeremy L Thompson const char *field_name; 99259fa3f92SJeremy L Thompson 99359fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name)); 9940183ed61SJeremy L Thompson code << tab << "// ---- Input field " << i << ": " << field_name << "\n"; 9950183ed61SJeremy L Thompson code << tab << "CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n"; 9964b3e95d5SJeremy L Thompson } 9970183ed61SJeremy L Thompson code << tab << "// -- Output fields\n"; 9984b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 99959fa3f92SJeremy L Thompson const char *field_name; 100059fa3f92SJeremy L Thompson 100159fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 10020183ed61SJeremy L Thompson code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 10030183ed61SJeremy L Thompson code << tab << "CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n"; 10044b3e95d5SJeremy L Thompson } 10054b3e95d5SJeremy L Thompson } 10064b3e95d5SJeremy L Thompson 10074b3e95d5SJeremy L Thompson // Input and output buffers 10080183ed61SJeremy L Thompson code << "\n"; 10090183ed61SJeremy L Thompson code << tab << "// -- QFunction inputs and outputs\n"; 10100183ed61SJeremy L Thompson code << tab << "// ---- Inputs\n"; 10110183ed61SJeremy L Thompson code << tab << "CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n"; 10124b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 101359fa3f92SJeremy L Thompson const char *field_name; 101459fa3f92SJeremy L Thompson 101559fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name)); 10160183ed61SJeremy L Thompson code << tab << "// ------ Input field " << i << ": " << field_name << "\n"; 10170183ed61SJeremy L Thompson code << tab << "inputs[" << i << "] = r_s_in_" << i << ";\n"; 10184b3e95d5SJeremy L Thompson } 10190183ed61SJeremy L Thompson code << tab << "// ---- Outputs\n"; 10200183ed61SJeremy L Thompson code << tab << "CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n"; 10214b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 102259fa3f92SJeremy L Thompson const char *field_name; 102359fa3f92SJeremy L Thompson 102459fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 10250183ed61SJeremy L Thompson code << tab << "// ------ Output field " << i << ": " << field_name << "\n"; 10260183ed61SJeremy L Thompson code << tab << "outputs[" << i << "] = r_s_out_" << i << ";\n"; 10274b3e95d5SJeremy L Thompson } 10284b3e95d5SJeremy L Thompson 10294b3e95d5SJeremy L Thompson // Apply QFunction 10300183ed61SJeremy L Thompson code << "\n"; 10310183ed61SJeremy L Thompson code << tab << "// -- Apply QFunction\n"; 10320183ed61SJeremy L Thompson code << tab << "" << qfunction_name << "(ctx, "; 103374398b5aSJeremy L Thompson if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) { 10344b3e95d5SJeremy L Thompson code << "1"; 10354b3e95d5SJeremy L Thompson } else { 10369123fb08SJeremy L Thompson code << Q_name; 10374b3e95d5SJeremy L Thompson } 10384b3e95d5SJeremy L Thompson code << ", inputs, outputs);\n"; 10394b3e95d5SJeremy L Thompson 10403a2968d6SJeremy L Thompson if (is_at_points) { 10413a2968d6SJeremy L Thompson // Map back to coefficients 10420183ed61SJeremy L Thompson code << "\n"; 10430183ed61SJeremy L Thompson code << tab << "// -- Output fields\n"; 10443a2968d6SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 104559fa3f92SJeremy L Thompson const char *field_name; 10463a2968d6SJeremy L Thompson std::string var_suffix = "_out_" + std::to_string(i); 10473a2968d6SJeremy L Thompson std::string P_name = "P_1d" + var_suffix; 10483a2968d6SJeremy L Thompson 104959fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 10500183ed61SJeremy L Thompson code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 10513a2968d6SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 10523a2968d6SJeremy L Thompson // Basis action 10530183ed61SJeremy L Thompson code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 10543a2968d6SJeremy L Thompson switch (eval_mode) { 10553a2968d6SJeremy L Thompson case CEED_EVAL_NONE: { 10563a2968d6SJeremy L Thompson CeedInt comp_stride; 10573a2968d6SJeremy L Thompson CeedElemRestriction elem_rstr; 10583a2968d6SJeremy L Thompson 1059*745f16d1SZach Atkins if (is_assemble) break; 10603a2968d6SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 10613a2968d6SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 10623a2968d6SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 10630183ed61SJeremy L Thompson code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 10640183ed61SJeremy L Thompson code << tab << "WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix 10653a2968d6SJeremy L Thompson << ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]" 10663a2968d6SJeremy L Thompson << ", r_s" << var_suffix << ", d" << var_suffix << ");\n"; 10673a2968d6SJeremy L Thompson break; 10683a2968d6SJeremy L Thompson } 10693a2968d6SJeremy L Thompson case CEED_EVAL_INTERP: 10700183ed61SJeremy L Thompson code << tab << "if (i >= points.num_per_elem[elem]) {\n"; 10710183ed61SJeremy L Thompson tab.push(); 10720183ed61SJeremy L Thompson code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n"; 10730183ed61SJeremy L Thompson tab.pop(); 10740183ed61SJeremy L Thompson code << tab << "}\n"; 10750183ed61SJeremy L Thompson code << tab << "InterpTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name 1076f725b54bSJeremy L Thompson << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n"; 10773a2968d6SJeremy L Thompson break; 10783a2968d6SJeremy L Thompson case CEED_EVAL_GRAD: 10790183ed61SJeremy L Thompson code << tab << "if (i >= points.num_per_elem[elem]) {\n"; 10800183ed61SJeremy L Thompson tab.push(); 10810183ed61SJeremy L Thompson code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n"; 10820183ed61SJeremy L Thompson tab.pop(); 10830183ed61SJeremy L Thompson code << tab << "}\n"; 10840183ed61SJeremy L Thompson code << tab << "GradTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name 1085f725b54bSJeremy L Thompson << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n"; 10863a2968d6SJeremy L Thompson break; 10873a2968d6SJeremy L Thompson // LCOV_EXCL_START 10883a2968d6SJeremy L Thompson case CEED_EVAL_WEIGHT: 10893a2968d6SJeremy L Thompson break; // Should not occur 10903a2968d6SJeremy L Thompson case CEED_EVAL_DIV: 10913a2968d6SJeremy L Thompson case CEED_EVAL_CURL: 10923a2968d6SJeremy L Thompson break; // TODO: Not implemented 10933a2968d6SJeremy L Thompson // LCOV_EXCL_STOP 10943a2968d6SJeremy L Thompson } 10953a2968d6SJeremy L Thompson } 10963a2968d6SJeremy L Thompson } else if (use_3d_slices) { 10974b3e95d5SJeremy L Thompson // Copy or apply transpose grad, if needed 10980183ed61SJeremy L Thompson code << "\n"; 10990183ed61SJeremy L Thompson code << tab << "// -- Output fields\n"; 11004b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 110159fa3f92SJeremy L Thompson const char *field_name; 11024b3e95d5SJeremy L Thompson std::string var_suffix = "_out_" + std::to_string(i); 11034b3e95d5SJeremy L Thompson std::string P_name = "P_1d" + var_suffix; 11044b3e95d5SJeremy L Thompson 110559fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 11060183ed61SJeremy L Thompson code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 11074b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 11084b3e95d5SJeremy L Thompson // Basis action 11090183ed61SJeremy L Thompson code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 11104b3e95d5SJeremy L Thompson switch (eval_mode) { 11114b3e95d5SJeremy L Thompson case CEED_EVAL_NONE: 11120183ed61SJeremy L Thompson code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n"; 11130183ed61SJeremy L Thompson tab.push(); 11140183ed61SJeremy L Thompson code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n"; 11150183ed61SJeremy L Thompson tab.pop(); 11160183ed61SJeremy L Thompson code << tab << "}\n"; 11173a2968d6SJeremy L Thompson break; 11184b3e95d5SJeremy L Thompson case CEED_EVAL_INTERP: 11190183ed61SJeremy L Thompson code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n"; 11200183ed61SJeremy L Thompson tab.push(); 11210183ed61SJeremy L Thompson code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n"; 11220183ed61SJeremy L Thompson tab.pop(); 11230183ed61SJeremy L Thompson code << tab << "}\n"; 11244b3e95d5SJeremy L Thompson break; 11254b3e95d5SJeremy L Thompson case CEED_EVAL_GRAD: 11260183ed61SJeremy L Thompson code << tab << "GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_s" << var_suffix << ", s_G" 1127f815fac9SJeremy L Thompson << var_suffix << ", r_q" << var_suffix << ");\n"; 11284b3e95d5SJeremy L Thompson break; 11294b3e95d5SJeremy L Thompson // LCOV_EXCL_START 11304b3e95d5SJeremy L Thompson case CEED_EVAL_WEIGHT: 11314b3e95d5SJeremy L Thompson break; // Should not occur 11324b3e95d5SJeremy L Thompson case CEED_EVAL_DIV: 11334b3e95d5SJeremy L Thompson case CEED_EVAL_CURL: 11344b3e95d5SJeremy L Thompson break; // TODO: Not implemented 11354b3e95d5SJeremy L Thompson // LCOV_EXCL_STOP 11364b3e95d5SJeremy L Thompson } 11374b3e95d5SJeremy L Thompson } 11384b3e95d5SJeremy L Thompson } 11390183ed61SJeremy L Thompson tab.pop(); 11400183ed61SJeremy L Thompson code << tab << "}\n"; 11414b3e95d5SJeremy L Thompson return CEED_ERROR_SUCCESS; 11424b3e95d5SJeremy L Thompson } 11434b3e95d5SJeremy L Thompson 11444b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 11459e201c85SYohann // Build single operator kernel 11467d8d0e25Snbeams //------------------------------------------------------------------------------ 11478d12f40eSJeremy L Thompson extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_build) { 114874398b5aSJeremy L Thompson bool is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false; 11497d8d0e25Snbeams Ceed ceed; 1150efa41df3SJeremy L Thompson CeedInt Q = 0, Q_1d = 0, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0; 1151b7453713SJeremy L Thompson CeedQFunctionField *qf_input_fields, *qf_output_fields; 1152b7453713SJeremy L Thompson CeedQFunction_Hip_gen *qf_data; 1153b7453713SJeremy L Thompson CeedQFunction qf; 1154b7453713SJeremy L Thompson CeedOperatorField *op_input_fields, *op_output_fields; 1155b7453713SJeremy L Thompson CeedOperator_Hip_gen *data; 11564b3e95d5SJeremy L Thompson std::ostringstream code; 11570183ed61SJeremy L Thompson Tab tab; 11584b3e95d5SJeremy L Thompson 11598d12f40eSJeremy L Thompson CeedCallBackend(CeedOperatorGetData(op, &data)); 11604b3e95d5SJeremy L Thompson { 11614b3e95d5SJeremy L Thompson bool is_setup_done; 1162b7453713SJeremy L Thompson 1163b7453713SJeremy L Thompson CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); 11648d12f40eSJeremy L Thompson if (is_setup_done) { 11658d12f40eSJeremy L Thompson *is_good_build = !data->use_fallback; 11668d12f40eSJeremy L Thompson return CEED_ERROR_SUCCESS; 11678d12f40eSJeremy L Thompson } 11684b3e95d5SJeremy L Thompson } 1169b7453713SJeremy L Thompson 11708d12f40eSJeremy L Thompson // Check field compatibility 11718d12f40eSJeremy L Thompson CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 11728d12f40eSJeremy L Thompson { 117374398b5aSJeremy L Thompson bool has_shared_bases = true; 11748d12f40eSJeremy L Thompson 11758d12f40eSJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 11768d12f40eSJeremy L Thompson CeedBasis basis; 11778d12f40eSJeremy L Thompson 11788d12f40eSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 11798d12f40eSJeremy L Thompson if (basis != CEED_BASIS_NONE) { 11808d12f40eSJeremy L Thompson bool is_tensor = true; 11818d12f40eSJeremy L Thompson const char *resource; 11828d12f40eSJeremy L Thompson char *resource_root; 11838d12f40eSJeremy L Thompson Ceed basis_ceed; 11848d12f40eSJeremy L Thompson 11858d12f40eSJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 1186c9192acaSJeremy L Thompson is_all_tensor = is_all_tensor && is_tensor; 1187c9192acaSJeremy L Thompson is_all_nontensor = is_all_nontensor && !is_tensor; 11888d12f40eSJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); 11898d12f40eSJeremy L Thompson CeedCallBackend(CeedGetResource(basis_ceed, &resource)); 11908d12f40eSJeremy L Thompson CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); 1191c9192acaSJeremy L Thompson has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared"); 11928d12f40eSJeremy L Thompson CeedCallBackend(CeedFree(&resource_root)); 11938d12f40eSJeremy L Thompson CeedCallBackend(CeedDestroy(&basis_ceed)); 11948d12f40eSJeremy L Thompson } 11958d12f40eSJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 11968d12f40eSJeremy L Thompson } 11978d12f40eSJeremy L Thompson 11988d12f40eSJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 11998d12f40eSJeremy L Thompson CeedBasis basis; 12008d12f40eSJeremy L Thompson 12018d12f40eSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 12028d12f40eSJeremy L Thompson if (basis != CEED_BASIS_NONE) { 12038d12f40eSJeremy L Thompson bool is_tensor = true; 12048d12f40eSJeremy L Thompson const char *resource; 12058d12f40eSJeremy L Thompson char *resource_root; 12068d12f40eSJeremy L Thompson Ceed basis_ceed; 12078d12f40eSJeremy L Thompson 12088d12f40eSJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 1209c9192acaSJeremy L Thompson is_all_tensor = is_all_tensor && is_tensor; 1210c9192acaSJeremy L Thompson is_all_nontensor = is_all_nontensor && !is_tensor; 12118d12f40eSJeremy L Thompson 12128d12f40eSJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); 12138d12f40eSJeremy L Thompson CeedCallBackend(CeedGetResource(basis_ceed, &resource)); 12148d12f40eSJeremy L Thompson CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); 1215c9192acaSJeremy L Thompson has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared"); 12168d12f40eSJeremy L Thompson CeedCallBackend(CeedFree(&resource_root)); 12178d12f40eSJeremy L Thompson CeedCallBackend(CeedDestroy(&basis_ceed)); 12188d12f40eSJeremy L Thompson } 12198d12f40eSJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 12208d12f40eSJeremy L Thompson } 12218d12f40eSJeremy L Thompson // -- Fallback to ref if not all bases are shared 122274398b5aSJeremy L Thompson if (!has_shared_bases) { 12238d12f40eSJeremy L Thompson *is_good_build = false; 12248d12f40eSJeremy L Thompson return CEED_ERROR_SUCCESS; 12258d12f40eSJeremy L Thompson } 12268d12f40eSJeremy L Thompson } 1227b7453713SJeremy L Thompson CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 1228b7453713SJeremy L Thompson CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 1229b7453713SJeremy L Thompson CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); 1230b7453713SJeremy L Thompson CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 12317d8d0e25Snbeams 12324b3e95d5SJeremy L Thompson // Get operator data 12333a2968d6SJeremy L Thompson CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points)); 123474398b5aSJeremy L Thompson { 1235efa41df3SJeremy L Thompson CeedInt max_P = 0, max_P_1d = 0; 123674398b5aSJeremy L Thompson 12374b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelData_Hip_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields, 123874398b5aSJeremy L Thompson qf_output_fields, &max_P, &max_P_1d, &Q, &Q_1d, &max_dim, &is_all_tensor, &use_3d_slices)); 123974398b5aSJeremy L Thompson data->max_P_1d = is_all_tensor ? max_P_1d : max_P; 124074398b5aSJeremy L Thompson } 12413a2968d6SJeremy L Thompson if (is_at_points) { 1242e31b7a9fSJeremy L Thompson CeedInt coords_dim = 0; 12433a2968d6SJeremy L Thompson CeedElemRestriction_Hip *rstr_data; 12443a2968d6SJeremy L Thompson CeedElemRestriction rstr_points = NULL; 12454b3e95d5SJeremy L Thompson 12463a2968d6SJeremy L Thompson CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); 12473a2968d6SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points)); 12483a2968d6SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride)); 1249e31b7a9fSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_points, &coords_dim)); 12503a2968d6SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data)); 12513a2968d6SJeremy L Thompson data->points.indices = (CeedInt *)rstr_data->d_offsets; 12523a2968d6SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); 1253e31b7a9fSJeremy L Thompson if (max_dim == 0) max_dim = coords_dim; 1254e31b7a9fSJeremy L Thompson if (Q_1d == 0) max_num_points = ceil(pow(max_num_points, 1.0 / max_dim)); 12553a2968d6SJeremy L Thompson } 1256e31b7a9fSJeremy L Thompson if (max_dim == 0) max_dim = 1; 1257e31b7a9fSJeremy L Thompson data->dim = max_dim; 12583a2968d6SJeremy L Thompson if (is_at_points) use_3d_slices = false; 12593a2968d6SJeremy L Thompson if (Q_1d == 0) { 12603a2968d6SJeremy L Thompson if (is_at_points) Q_1d = max_num_points; 12613a2968d6SJeremy L Thompson else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d)); 12624b3e95d5SJeremy L Thompson } 126374398b5aSJeremy L Thompson if (Q == 0) Q = Q_1d; 126474398b5aSJeremy L Thompson data->Q = Q; 12654b3e95d5SJeremy L Thompson data->Q_1d = Q_1d; 12664b3e95d5SJeremy L Thompson 12670b454692Sjeremylt // Check for restriction only identity operator 12684b3e95d5SJeremy L Thompson { 12694b3e95d5SJeremy L Thompson bool is_identity_qf; 12704b3e95d5SJeremy L Thompson 12712b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf)); 12720b454692Sjeremylt if (is_identity_qf) { 12739e201c85SYohann CeedEvalMode eval_mode_in, eval_mode_out; 1274b7453713SJeremy L Thompson 12752b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in)); 12762b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out)); 12776574a04fSJeremy L Thompson CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND, 12786574a04fSJeremy L Thompson "Backend does not implement restriction only identity operators"); 12790b454692Sjeremylt } 12804b3e95d5SJeremy L Thompson } 1281b2165e7aSSebastian Grimberg 1282b2165e7aSSebastian Grimberg // Load basis source files 1283eaf9ad10SZach Atkins if (!is_all_nontensor) { 12840183ed61SJeremy L Thompson code << tab << "// Tensor basis source\n"; 12850183ed61SJeremy L Thompson code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n"; 128674398b5aSJeremy L Thompson } 128774398b5aSJeremy L Thompson if (!is_all_tensor) { 12880183ed61SJeremy L Thompson code << tab << "// Non-tensor basis source\n"; 12890183ed61SJeremy L Thompson code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n"; 12909123fb08SJeremy L Thompson } 12919123fb08SJeremy L Thompson if (is_at_points) { 12920183ed61SJeremy L Thompson code << tab << "// AtPoints basis source\n"; 12930183ed61SJeremy L Thompson code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n"; 12949123fb08SJeremy L Thompson } 129574398b5aSJeremy L Thompson if (!is_all_tensor && !is_all_nontensor) { 12960183ed61SJeremy L Thompson code << tab << "// Tensor basis source\n"; 12970183ed61SJeremy L Thompson code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h>\n\n"; 129874398b5aSJeremy L Thompson } 12990183ed61SJeremy L Thompson code << tab << "// CodeGen operator source\n"; 13000183ed61SJeremy L Thompson code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n"; 13017d8d0e25Snbeams 13024b3e95d5SJeremy L Thompson // Get QFunction name 13034b3e95d5SJeremy L Thompson std::string qfunction_name(qf_data->qfunction_name); 13044b3e95d5SJeremy L Thompson std::string operator_name; 13054b3e95d5SJeremy L Thompson 130609095acaSJeremy L Thompson operator_name = "CeedKernelHipGenOperator_" + qfunction_name; 13077d8d0e25Snbeams 13089e201c85SYohann // Define CEED_Q_VLA 13090183ed61SJeremy L Thompson code << "\n" << tab << "#undef CEED_Q_VLA\n"; 131074398b5aSJeremy L Thompson if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) { 13110183ed61SJeremy L Thompson code << tab << "#define CEED_Q_VLA 1\n\n"; 13129e201c85SYohann } else { 13130183ed61SJeremy L Thompson code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n"; 13149e201c85SYohann } 13159e201c85SYohann 13164b3e95d5SJeremy L Thompson // Add user QFunction source 13174b3e95d5SJeremy L Thompson { 13189c25dd66SJeremy L Thompson const char *source_path; 13194b3e95d5SJeremy L Thompson 13209c25dd66SJeremy L Thompson CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path)); 13219c25dd66SJeremy L Thompson CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file"); 13229c25dd66SJeremy L Thompson 13230183ed61SJeremy L Thompson code << tab << "// User QFunction source\n"; 13240183ed61SJeremy L Thompson code << tab << "#include \"" << source_path << "\"\n\n"; 13254b3e95d5SJeremy L Thompson } 13267d8d0e25Snbeams 13277d8d0e25Snbeams // Setup 13280183ed61SJeremy L Thompson code << "\n" << tab << "// -----------------------------------------------------------------------------\n"; 13290183ed61SJeremy L Thompson code << tab << "// Operator Kernel\n"; 13300183ed61SJeremy L Thompson code << tab << "// \n"; 13310183ed61SJeremy L Thompson code << tab << "// d_[in,out]_i: CeedVector device array\n"; 13320183ed61SJeremy L Thompson code << tab << "// r_[in,out]_e_i: Element vector register\n"; 13330183ed61SJeremy L Thompson code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n"; 13340183ed61SJeremy L Thompson code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n"; 13350183ed61SJeremy L Thompson code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n"; 13360183ed61SJeremy L Thompson code << tab << "// \n"; 13370183ed61SJeremy L Thompson code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n"; 13380183ed61SJeremy L Thompson code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; 13390183ed61SJeremy L Thompson code << tab << "// -----------------------------------------------------------------------------\n"; 13400183ed61SJeremy L Thompson code << tab << "extern \"C\" __launch_bounds__(BLOCK_SIZE)\n"; 13412b730f8bSJeremy L Thompson code << "__global__ void " << operator_name 13423a2968d6SJeremy L Thompson << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar* W, Points_Hip points) {\n"; 13430183ed61SJeremy L Thompson tab.push(); 13444b3e95d5SJeremy L Thompson 13454b3e95d5SJeremy L Thompson // Scratch buffers 13469e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 13474b3e95d5SJeremy L Thompson CeedEvalMode eval_mode; 13484b3e95d5SJeremy L Thompson 13492b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 13509e201c85SYohann if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 13510183ed61SJeremy L Thompson code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; 13527d8d0e25Snbeams } 13537d8d0e25Snbeams } 13549e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 13550183ed61SJeremy L Thompson code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; 13567d8d0e25Snbeams } 13577d8d0e25Snbeams 13580183ed61SJeremy L Thompson code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; 135974398b5aSJeremy L Thompson if (!is_all_tensor) { 13600183ed61SJeremy L Thompson code << tab << "const CeedInt Q = " << Q << ";\n"; 136174398b5aSJeremy L Thompson } 136274398b5aSJeremy L Thompson if (!is_all_nontensor) { 13630183ed61SJeremy L Thompson code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n"; 136474398b5aSJeremy L Thompson } 13653a2968d6SJeremy L Thompson if (is_at_points) { 13660183ed61SJeremy L Thompson code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n"; 13670183ed61SJeremy L Thompson code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n"; 13683a2968d6SJeremy L Thompson } 13697d8d0e25Snbeams 13704b3e95d5SJeremy L Thompson // Shared data 13710183ed61SJeremy L Thompson code << tab << "extern __shared__ CeedScalar slice[];\n"; 13720183ed61SJeremy L Thompson code << tab << "SharedData_Hip data;\n"; 13730183ed61SJeremy L Thompson code << tab << "data.t_id_x = threadIdx.x;\n"; 13740183ed61SJeremy L Thompson code << tab << "data.t_id_y = threadIdx.y;\n"; 13750183ed61SJeremy L Thompson code << tab << "data.t_id_z = threadIdx.z;\n"; 13760183ed61SJeremy L Thompson code << tab << "data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 13770183ed61SJeremy L Thompson code << tab << "data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n"; 13787d8d0e25Snbeams 13799ee499e5SJeremy L Thompson // -- Determine input mat reuse 138045a787f7SJeremy L Thompson FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX]; 13819ee499e5SJeremy L Thompson 13829ee499e5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 138345a787f7SJeremy L Thompson input_matrix_reuse[i].index = -1; 13849ee499e5SJeremy L Thompson } 13859ee499e5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 138674398b5aSJeremy L Thompson bool is_tensor = true; 13879ee499e5SJeremy L Thompson CeedEvalMode eval_mode_i; 13889ee499e5SJeremy L Thompson CeedBasis basis_i; 13899ee499e5SJeremy L Thompson 13909ee499e5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i)); 13919ee499e5SJeremy L Thompson if (eval_mode_i == CEED_EVAL_WEIGHT) continue; 13929ee499e5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i)); 139374398b5aSJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor)); 139445a787f7SJeremy L Thompson for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) { 13959ee499e5SJeremy L Thompson CeedEvalMode eval_mode_j; 13969ee499e5SJeremy L Thompson CeedBasis basis_j; 13979ee499e5SJeremy L Thompson 13989ee499e5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); 13999ee499e5SJeremy L Thompson if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 14009ee499e5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); 14019ee499e5SJeremy L Thompson if (basis_i == basis_j) { 14029ee499e5SJeremy L Thompson if (is_tensor) { 140345a787f7SJeremy L Thompson input_matrix_reuse[i].index = j; 140445a787f7SJeremy L Thompson input_matrix_reuse[i].is_input = true; 140545a787f7SJeremy L Thompson input_matrix_reuse[i].eval_mode = eval_mode_j; 14069ee499e5SJeremy L Thompson } else { 14079ee499e5SJeremy L Thompson // For non-tensor can only re-use with the same eval mode 14089ee499e5SJeremy L Thompson if (eval_mode_i == eval_mode_j) { 140945a787f7SJeremy L Thompson input_matrix_reuse[i].index = j; 141045a787f7SJeremy L Thompson input_matrix_reuse[i].is_input = true; 141145a787f7SJeremy L Thompson input_matrix_reuse[i].eval_mode = eval_mode_j; 14129ee499e5SJeremy L Thompson } 14139ee499e5SJeremy L Thompson } 14149ee499e5SJeremy L Thompson } 14159ee499e5SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_j)); 14169ee499e5SJeremy L Thompson } 14179ee499e5SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_i)); 14189ee499e5SJeremy L Thompson } 14199ee499e5SJeremy L Thompson 14209ee499e5SJeremy L Thompson // -- Determine output mat reuse 142145a787f7SJeremy L Thompson FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX]; 14229ee499e5SJeremy L Thompson 14239ee499e5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 142445a787f7SJeremy L Thompson output_matrix_reuse[i].index = -1; 14259ee499e5SJeremy L Thompson } 14269ee499e5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 142774398b5aSJeremy L Thompson bool is_tensor = true; 14289ee499e5SJeremy L Thompson CeedEvalMode eval_mode_i; 14299ee499e5SJeremy L Thompson CeedBasis basis_i; 14309ee499e5SJeremy L Thompson 14319ee499e5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i)); 14329ee499e5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i)); 143345a787f7SJeremy L Thompson for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) { 14349ee499e5SJeremy L Thompson CeedEvalMode eval_mode_j; 14359ee499e5SJeremy L Thompson CeedBasis basis_j; 14369ee499e5SJeremy L Thompson 14379ee499e5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); 14389ee499e5SJeremy L Thompson if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 14399ee499e5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); 14409ee499e5SJeremy L Thompson if (basis_i == basis_j) { 14419ee499e5SJeremy L Thompson if (is_tensor) { 144245a787f7SJeremy L Thompson output_matrix_reuse[i].index = j; 144345a787f7SJeremy L Thompson output_matrix_reuse[i].is_input = true; 144445a787f7SJeremy L Thompson output_matrix_reuse[i].eval_mode = eval_mode_j; 14459ee499e5SJeremy L Thompson } else { 14469ee499e5SJeremy L Thompson // For non-tensor can only re-use with the same eval mode 14479ee499e5SJeremy L Thompson if (eval_mode_i == eval_mode_j) { 144845a787f7SJeremy L Thompson output_matrix_reuse[i].index = j; 144945a787f7SJeremy L Thompson output_matrix_reuse[i].is_input = true; 145045a787f7SJeremy L Thompson output_matrix_reuse[i].eval_mode = eval_mode_j; 14519ee499e5SJeremy L Thompson } 14529ee499e5SJeremy L Thompson } 14539ee499e5SJeremy L Thompson } 14549ee499e5SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_j)); 14559ee499e5SJeremy L Thompson } 145645a787f7SJeremy L Thompson for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) { 14579ee499e5SJeremy L Thompson CeedEvalMode eval_mode_j; 14589ee499e5SJeremy L Thompson CeedBasis basis_j; 14599ee499e5SJeremy L Thompson 14609ee499e5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j)); 14619ee499e5SJeremy L Thompson if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 14629ee499e5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j)); 146374398b5aSJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor)); 14649ee499e5SJeremy L Thompson if (basis_i == basis_j) { 14659ee499e5SJeremy L Thompson if (is_tensor) { 146645a787f7SJeremy L Thompson output_matrix_reuse[i].index = j; 146745a787f7SJeremy L Thompson output_matrix_reuse[i].is_input = false; 146845a787f7SJeremy L Thompson output_matrix_reuse[i].eval_mode = eval_mode_j; 14699ee499e5SJeremy L Thompson } else { 14709ee499e5SJeremy L Thompson // For non-tensor can only re-use with the same eval mode 14719ee499e5SJeremy L Thompson if (eval_mode_i == eval_mode_j) { 147245a787f7SJeremy L Thompson output_matrix_reuse[i].index = j; 147345a787f7SJeremy L Thompson output_matrix_reuse[i].is_input = false; 147445a787f7SJeremy L Thompson output_matrix_reuse[i].eval_mode = eval_mode_j; 14759ee499e5SJeremy L Thompson } 14769ee499e5SJeremy L Thompson } 14779ee499e5SJeremy L Thompson } 14789ee499e5SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_j)); 14799ee499e5SJeremy L Thompson } 14809ee499e5SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_i)); 14819ee499e5SJeremy L Thompson } 14829ee499e5SJeremy L Thompson 14837d8d0e25Snbeams // Initialize constants, and matrices B and G 14840183ed61SJeremy L Thompson code << "\n" << tab << "// Input field constants and basis data\n"; 14859e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 14860183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], 1487ca1da9b9SJeremy L Thompson max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false)); 14887d8d0e25Snbeams } 14890183ed61SJeremy L Thompson code << "\n" << tab << "// Output field constants and basis data\n"; 14909e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 14910183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], 1492ca1da9b9SJeremy L Thompson max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false)); 14934b3e95d5SJeremy L Thompson } 14947d8d0e25Snbeams 14954b3e95d5SJeremy L Thompson // Loop over all elements 14960183ed61SJeremy L Thompson code << "\n" << tab << "// Element loop\n"; 14970183ed61SJeremy L Thompson code << tab << "__syncthreads();\n"; 14980183ed61SJeremy L Thompson code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n"; 14990183ed61SJeremy L Thompson tab.push(); 15004b3e95d5SJeremy L Thompson 1501e93651e5SJeremy L Thompson // -- Compute minimum buffer space needed 15023a2968d6SJeremy L Thompson CeedInt max_rstr_buffer_size = 1; 1503e93651e5SJeremy L Thompson 1504e93651e5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 15056de40545SJeremy L Thompson CeedEvalMode eval_mode; 15066de40545SJeremy L Thompson 15076de40545SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 15086de40545SJeremy L Thompson if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) { 1509a61b1c91SJeremy L Thompson CeedInt num_comp; 1510e93651e5SJeremy L Thompson CeedElemRestriction elem_rstr; 1511e93651e5SJeremy L Thompson 1512e93651e5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 1513e93651e5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 1514a61b1c91SJeremy L Thompson max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); 1515681d0ea7SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 1516e93651e5SJeremy L Thompson } 15176de40545SJeremy L Thompson } 1518e93651e5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 15196de40545SJeremy L Thompson CeedEvalMode eval_mode; 15206de40545SJeremy L Thompson 15216de40545SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 15226de40545SJeremy L Thompson if (eval_mode != CEED_EVAL_NONE) { 1523a61b1c91SJeremy L Thompson CeedInt num_comp; 1524e93651e5SJeremy L Thompson CeedElemRestriction elem_rstr; 1525e93651e5SJeremy L Thompson 1526e93651e5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 1527e93651e5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 1528a61b1c91SJeremy L Thompson max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); 1529681d0ea7SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 1530e93651e5SJeremy L Thompson } 15316de40545SJeremy L Thompson } 15320183ed61SJeremy L Thompson code << tab << "// Scratch restriction buffer space\n"; 15330183ed61SJeremy L Thompson code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n"; 1534e93651e5SJeremy L Thompson 1535e93651e5SJeremy L Thompson // -- Determine best input field processing order 1536e93651e5SJeremy L Thompson CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX]; 1537e93651e5SJeremy L Thompson 1538e93651e5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 1539e93651e5SJeremy L Thompson field_rstr_in_buffer[i] = -1; 1540e93651e5SJeremy L Thompson input_field_order[i] = -1; 1541e93651e5SJeremy L Thompson } 1542e93651e5SJeremy L Thompson { 1543e93651e5SJeremy L Thompson bool is_ordered[CEED_FIELD_MAX]; 1544e93651e5SJeremy L Thompson CeedInt curr_index = 0; 1545e93651e5SJeremy L Thompson 1546e93651e5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false; 1547e93651e5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 1548e93651e5SJeremy L Thompson CeedVector vec_i; 1549e93651e5SJeremy L Thompson CeedElemRestriction rstr_i; 1550e93651e5SJeremy L Thompson 1551e93651e5SJeremy L Thompson if (is_ordered[i]) continue; 1552e93651e5SJeremy L Thompson field_rstr_in_buffer[i] = i; 1553e93651e5SJeremy L Thompson is_ordered[i] = true; 1554e93651e5SJeremy L Thompson input_field_order[curr_index] = i; 1555e93651e5SJeremy L Thompson curr_index++; 1556034f99fdSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i)); 1557e93651e5SJeremy L Thompson if (vec_i == CEED_VECTOR_NONE) continue; // CEED_EVAL_WEIGHT 1558e93651e5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i)); 1559e93651e5SJeremy L Thompson for (CeedInt j = i + 1; j < num_input_fields; j++) { 1560e93651e5SJeremy L Thompson CeedVector vec_j; 1561e93651e5SJeremy L Thompson CeedElemRestriction rstr_j; 1562e93651e5SJeremy L Thompson 1563e93651e5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j)); 1564e93651e5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j)); 1565e93651e5SJeremy L Thompson if (rstr_i == rstr_j && vec_i == vec_j) { 1566e93651e5SJeremy L Thompson field_rstr_in_buffer[j] = i; 1567e93651e5SJeremy L Thompson is_ordered[j] = true; 1568e93651e5SJeremy L Thompson input_field_order[curr_index] = j; 1569e93651e5SJeremy L Thompson curr_index++; 1570e93651e5SJeremy L Thompson } 15713a2968d6SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec_j)); 15723a2968d6SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); 1573e93651e5SJeremy L Thompson } 15743a2968d6SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec_i)); 15753a2968d6SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); 1576e93651e5SJeremy L Thompson } 1577e93651e5SJeremy L Thompson } 1578e93651e5SJeremy L Thompson 15794b3e95d5SJeremy L Thompson // -- Input restriction and basis 15800183ed61SJeremy L Thompson code << "\n" << tab << "// -- Input field restrictions and basis actions\n"; 15819e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 158259fa3f92SJeremy L Thompson const char *field_name; 158359fa3f92SJeremy L Thompson const CeedInt f = input_field_order[i]; 1584e93651e5SJeremy L Thompson 158559fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name)); 15860183ed61SJeremy L Thompson code << tab << "// ---- Input field " << f << ": " << field_name << "\n"; 15877d8d0e25Snbeams 15884b3e95d5SJeremy L Thompson // ---- Restriction 15890183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], 15900183ed61SJeremy L Thompson max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices)); 1591b7453713SJeremy L Thompson 15924b3e95d5SJeremy L Thompson // ---- Basis action 15930183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true, 15940183ed61SJeremy L Thompson is_all_tensor, is_at_points, use_3d_slices)); 15957d8d0e25Snbeams } 15967d8d0e25Snbeams 15974b3e95d5SJeremy L Thompson // -- Q function 15980183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields, 15990183ed61SJeremy L Thompson qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name, 1600*745f16d1SZach Atkins Q_1d, is_all_tensor, is_at_points, use_3d_slices, false)); 16017d8d0e25Snbeams 16024b3e95d5SJeremy L Thompson // -- Output basis and restriction 16030183ed61SJeremy L Thompson code << "\n" << tab << "// -- Output field basis action and restrictions\n"; 16049e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 160559fa3f92SJeremy L Thompson const char *field_name; 160659fa3f92SJeremy L Thompson 160759fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 16080183ed61SJeremy L Thompson code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 1609b7453713SJeremy L Thompson 16104b3e95d5SJeremy L Thompson // ---- Basis action 16110183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false, 16120183ed61SJeremy L Thompson is_all_tensor, is_at_points, use_3d_slices)); 16137d8d0e25Snbeams 16144b3e95d5SJeremy L Thompson // ---- Restriction 16150183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, i, NULL, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, 16160183ed61SJeremy L Thompson false, is_all_tensor, is_at_points, use_3d_slices)); 16177d8d0e25Snbeams } 16187d8d0e25Snbeams 16194b3e95d5SJeremy L Thompson // Close loop and function 16200183ed61SJeremy L Thompson tab.pop(); 16210183ed61SJeremy L Thompson code << tab << "}\n"; 16220183ed61SJeremy L Thompson tab.pop(); 16230183ed61SJeremy L Thompson code << tab << "}\n"; 16240183ed61SJeremy L Thompson code << tab << "// -----------------------------------------------------------------------------\n\n"; 16257d8d0e25Snbeams 1626539ec17dSJeremy L Thompson CeedInt block_sizes[3] = {0, 0, 0}; 16279e201c85SYohann CeedInt num_elem; 1628b7453713SJeremy L Thompson 16293a2968d6SJeremy L Thompson // Compile 16302b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); 163174398b5aSJeremy L Thompson CeedCallBackend(BlockGridCalculate_Hip_gen(is_all_tensor ? max_dim : 1, num_elem, data->max_P_1d, is_all_tensor ? Q_1d : Q, block_sizes)); 16328d12f40eSJeremy L Thompson { 16338d12f40eSJeremy L Thompson bool is_compile_good = false; 16348d12f40eSJeremy L Thompson 1635a61b1c91SJeremy L Thompson data->thread_1d = block_sizes[0]; 16366b92dc4bSJeremy L Thompson CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "OP_T_1D", block_sizes[0], "BLOCK_SIZE", 16372b730f8bSJeremy L Thompson block_sizes[0] * block_sizes[1] * block_sizes[2])); 16388d12f40eSJeremy L Thompson if (is_compile_good) { 16398d12f40eSJeremy L Thompson *is_good_build = true; 1640eb7e6cafSJeremy L Thompson CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, operator_name.c_str(), &data->op)); 16418d12f40eSJeremy L Thompson } else { 16428d12f40eSJeremy L Thompson *is_good_build = false; 16438d12f40eSJeremy L Thompson data->use_fallback = true; 16448d12f40eSJeremy L Thompson } 16458d12f40eSJeremy L Thompson } 16462b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorSetSetupDone(op)); 16479bc66399SJeremy L Thompson CeedCallBackend(CeedDestroy(&ceed)); 1648c11e12f4SJeremy L Thompson CeedCallBackend(CeedQFunctionDestroy(&qf)); 1649e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 16507d8d0e25Snbeams } 16512a86cc9dSSebastian Grimberg 16527d8d0e25Snbeams //------------------------------------------------------------------------------ 16530183ed61SJeremy L Thompson // Build AtPoints assembly operator kernel 16540183ed61SJeremy L Thompson //------------------------------------------------------------------------------ 16550183ed61SJeremy L Thompson static int CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op, bool is_full, bool *is_good_build) { 16560183ed61SJeremy L Thompson bool is_all_tensor = true, is_at_points = false, use_3d_slices = false; 16570183ed61SJeremy L Thompson Ceed ceed; 16580183ed61SJeremy L Thompson CeedInt Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0; 16590183ed61SJeremy L Thompson CeedQFunctionField *qf_input_fields, *qf_output_fields; 16600183ed61SJeremy L Thompson CeedQFunction_Hip_gen *qf_data; 16610183ed61SJeremy L Thompson CeedQFunction qf; 16620183ed61SJeremy L Thompson CeedOperatorField *op_input_fields, *op_output_fields; 16630183ed61SJeremy L Thompson CeedOperator_Hip_gen *data; 16640183ed61SJeremy L Thompson std::ostringstream code; 16650183ed61SJeremy L Thompson Tab tab; 16660183ed61SJeremy L Thompson 16670183ed61SJeremy L Thompson // Check compatibility 16680183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 16690183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points)); 16700183ed61SJeremy L Thompson CeedCheck(is_at_points, ceed, CEED_ERROR_BACKEND, "Only AtPoints operator assembly supported"); 16710183ed61SJeremy L Thompson 16720183ed61SJeremy L Thompson // Retrieve operator data 16730183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorGetData(op, &data)); 16740183ed61SJeremy L Thompson Q = data->Q; 16750183ed61SJeremy L Thompson Q_1d = data->Q_1d; 16760183ed61SJeremy L Thompson max_dim = data->dim; 16770183ed61SJeremy L Thompson { 16780183ed61SJeremy L Thompson CeedElemRestriction rstr_points = NULL; 16790183ed61SJeremy L Thompson 16800183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); 16810183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points)); 16820183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride)); 16830183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); 16840183ed61SJeremy L Thompson } 16850183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 16860183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); 16870183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 16880183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 16890183ed61SJeremy L Thompson 16900183ed61SJeremy L Thompson // Load basis source files 16910183ed61SJeremy L Thompson code << tab << "// Tensor basis source\n"; 16920183ed61SJeremy L Thompson code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n"; 16930183ed61SJeremy L Thompson code << tab << "// AtPoints basis source\n"; 16940183ed61SJeremy L Thompson code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n"; 16950183ed61SJeremy L Thompson code << tab << "// CodeGen operator source\n"; 16960183ed61SJeremy L Thompson code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n"; 16970183ed61SJeremy L Thompson 16980183ed61SJeremy L Thompson // Get QFunction name 16990183ed61SJeremy L Thompson std::string qfunction_name(qf_data->qfunction_name); 17000183ed61SJeremy L Thompson std::string operator_name; 17010183ed61SJeremy L Thompson 17020183ed61SJeremy L Thompson if (is_full) { 17030183ed61SJeremy L Thompson operator_name = "CeedKernelHipGenOperatorFullAssembly_" + qfunction_name; 17040183ed61SJeremy L Thompson } else { 17050183ed61SJeremy L Thompson operator_name = "CeedKernelHipGenOperatorDiagonalAssembly_" + qfunction_name; 17060183ed61SJeremy L Thompson } 17070183ed61SJeremy L Thompson 17080183ed61SJeremy L Thompson // Define CEED_Q_VLA 17090183ed61SJeremy L Thompson code << "\n" << tab << "#undef CEED_Q_VLA\n"; 17100183ed61SJeremy L Thompson code << tab << "#define CEED_Q_VLA 1\n\n"; 17110183ed61SJeremy L Thompson 17120183ed61SJeremy L Thompson // Add user QFunction source 17130183ed61SJeremy L Thompson { 17140183ed61SJeremy L Thompson const char *source_path; 17150183ed61SJeremy L Thompson 17160183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path)); 17170183ed61SJeremy L Thompson CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file"); 17180183ed61SJeremy L Thompson 17190183ed61SJeremy L Thompson code << tab << "// User QFunction source\n"; 17200183ed61SJeremy L Thompson code << tab << "#include \"" << source_path << "\"\n\n"; 17210183ed61SJeremy L Thompson } 17220183ed61SJeremy L Thompson 17230183ed61SJeremy L Thompson // Setup 17240183ed61SJeremy L Thompson code << "\n" << tab << "// -----------------------------------------------------------------------------\n"; 17250183ed61SJeremy L Thompson code << tab << "// Operator Assembly Kernel\n"; 17260183ed61SJeremy L Thompson code << tab << "// \n"; 17270183ed61SJeremy L Thompson code << tab << "// d_[in,out]_i: CeedVector device array\n"; 17280183ed61SJeremy L Thompson code << tab << "// r_[in,out]_e_i: Element vector register\n"; 17290183ed61SJeremy L Thompson code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n"; 17300183ed61SJeremy L Thompson code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n"; 17310183ed61SJeremy L Thompson code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n"; 17320183ed61SJeremy L Thompson code << tab << "// \n"; 17330183ed61SJeremy L Thompson code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n"; 17340183ed61SJeremy L Thompson code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; 17350183ed61SJeremy L Thompson code << tab << "// -----------------------------------------------------------------------------\n"; 17360183ed61SJeremy L Thompson code << tab << "extern \"C\" __global__ void " << operator_name 17370183ed61SJeremy L Thompson << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar *W, Points_Hip " 17380183ed61SJeremy L Thompson "points, CeedScalar *__restrict__ values_array) {\n"; 17390183ed61SJeremy L Thompson tab.push(); 17400183ed61SJeremy L Thompson 17410183ed61SJeremy L Thompson // Scratch buffers 17420183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 17430183ed61SJeremy L Thompson CeedEvalMode eval_mode; 17440183ed61SJeremy L Thompson 17450183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 17460183ed61SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 17470183ed61SJeremy L Thompson code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; 17480183ed61SJeremy L Thompson } 17490183ed61SJeremy L Thompson } 17500183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 17510183ed61SJeremy L Thompson code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; 17520183ed61SJeremy L Thompson } 17530183ed61SJeremy L Thompson 17540183ed61SJeremy L Thompson code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; 17550183ed61SJeremy L Thompson code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n"; 17560183ed61SJeremy L Thompson code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n"; 17570183ed61SJeremy L Thompson code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n"; 17580183ed61SJeremy L Thompson 17590183ed61SJeremy L Thompson // Shared data 17600183ed61SJeremy L Thompson code << tab << "extern __shared__ CeedScalar slice[];\n"; 17610183ed61SJeremy L Thompson code << tab << "SharedData_Hip data;\n"; 17620183ed61SJeremy L Thompson code << tab << "data.t_id_x = threadIdx.x;\n"; 17630183ed61SJeremy L Thompson code << tab << "data.t_id_y = threadIdx.y;\n"; 17640183ed61SJeremy L Thompson code << tab << "data.t_id_z = threadIdx.z;\n"; 17650183ed61SJeremy L Thompson code << tab << "data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 17660183ed61SJeremy L Thompson code << tab << "data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n"; 17670183ed61SJeremy L Thompson 17680183ed61SJeremy L Thompson // -- Determine input mat reuse 17690183ed61SJeremy L Thompson FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX]; 17700183ed61SJeremy L Thompson 17710183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 17720183ed61SJeremy L Thompson input_matrix_reuse[i].index = -1; 17730183ed61SJeremy L Thompson } 17740183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 17750183ed61SJeremy L Thompson CeedEvalMode eval_mode_i; 17760183ed61SJeremy L Thompson CeedBasis basis_i; 17770183ed61SJeremy L Thompson 17780183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i)); 17790183ed61SJeremy L Thompson if (eval_mode_i == CEED_EVAL_WEIGHT) continue; 17800183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i)); 17810183ed61SJeremy L Thompson for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) { 17820183ed61SJeremy L Thompson CeedEvalMode eval_mode_j; 17830183ed61SJeremy L Thompson CeedBasis basis_j; 17840183ed61SJeremy L Thompson 17850183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); 17860183ed61SJeremy L Thompson if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 17870183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); 17880183ed61SJeremy L Thompson if (basis_i == basis_j) { 17890183ed61SJeremy L Thompson input_matrix_reuse[i].index = j; 17900183ed61SJeremy L Thompson input_matrix_reuse[i].is_input = true; 17910183ed61SJeremy L Thompson input_matrix_reuse[i].eval_mode = eval_mode_j; 17920183ed61SJeremy L Thompson } 17930183ed61SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_j)); 17940183ed61SJeremy L Thompson } 17950183ed61SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_i)); 17960183ed61SJeremy L Thompson } 17970183ed61SJeremy L Thompson 17980183ed61SJeremy L Thompson // -- Determine output mat reuse 17990183ed61SJeremy L Thompson FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX]; 18000183ed61SJeremy L Thompson 18010183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 18020183ed61SJeremy L Thompson output_matrix_reuse[i].index = -1; 18030183ed61SJeremy L Thompson } 18040183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 18050183ed61SJeremy L Thompson CeedEvalMode eval_mode_i; 18060183ed61SJeremy L Thompson CeedBasis basis_i; 18070183ed61SJeremy L Thompson 18080183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i)); 18090183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i)); 18100183ed61SJeremy L Thompson for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) { 18110183ed61SJeremy L Thompson CeedEvalMode eval_mode_j; 18120183ed61SJeremy L Thompson CeedBasis basis_j; 18130183ed61SJeremy L Thompson 18140183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); 18150183ed61SJeremy L Thompson if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 18160183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); 18170183ed61SJeremy L Thompson if (basis_i == basis_j) { 18180183ed61SJeremy L Thompson output_matrix_reuse[i].index = j; 18190183ed61SJeremy L Thompson output_matrix_reuse[i].is_input = true; 18200183ed61SJeremy L Thompson output_matrix_reuse[i].eval_mode = eval_mode_j; 18210183ed61SJeremy L Thompson } 18220183ed61SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_j)); 18230183ed61SJeremy L Thompson } 18240183ed61SJeremy L Thompson for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) { 18250183ed61SJeremy L Thompson CeedEvalMode eval_mode_j; 18260183ed61SJeremy L Thompson CeedBasis basis_j; 18270183ed61SJeremy L Thompson 18280183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j)); 18290183ed61SJeremy L Thompson if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 18300183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j)); 18310183ed61SJeremy L Thompson if (basis_i == basis_j) { 18320183ed61SJeremy L Thompson output_matrix_reuse[i].index = j; 18330183ed61SJeremy L Thompson output_matrix_reuse[i].is_input = false; 18340183ed61SJeremy L Thompson output_matrix_reuse[i].eval_mode = eval_mode_j; 18350183ed61SJeremy L Thompson } 18360183ed61SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_j)); 18370183ed61SJeremy L Thompson } 18380183ed61SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_i)); 18390183ed61SJeremy L Thompson } 18400183ed61SJeremy L Thompson 18410183ed61SJeremy L Thompson // Initialize constants, and matrices B and G 18420183ed61SJeremy L Thompson code << "\n" << tab << "// Input field constants and basis data\n"; 18430183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 18440183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], 1845ca1da9b9SJeremy L Thompson max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false)); 18460183ed61SJeremy L Thompson } 18470183ed61SJeremy L Thompson code << "\n" << tab << "// Output field constants and basis data\n"; 18480183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 18490183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], 1850ca1da9b9SJeremy L Thompson max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false)); 18510183ed61SJeremy L Thompson } 18520183ed61SJeremy L Thompson 18530183ed61SJeremy L Thompson // Loop over all elements 18540183ed61SJeremy L Thompson code << "\n" << tab << "// Element loop\n"; 18550183ed61SJeremy L Thompson code << tab << "__syncthreads();\n"; 18560183ed61SJeremy L Thompson code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n"; 18570183ed61SJeremy L Thompson tab.push(); 18580183ed61SJeremy L Thompson 18590183ed61SJeremy L Thompson // -- Compute minimum buffer space needed 18600183ed61SJeremy L Thompson CeedInt max_rstr_buffer_size = 1; 18610183ed61SJeremy L Thompson 18620183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 18630183ed61SJeremy L Thompson CeedEvalMode eval_mode; 18640183ed61SJeremy L Thompson 18650183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 18660183ed61SJeremy L Thompson if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) { 18670183ed61SJeremy L Thompson CeedInt num_comp; 18680183ed61SJeremy L Thompson CeedElemRestriction elem_rstr; 18690183ed61SJeremy L Thompson 18700183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 18710183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 18720183ed61SJeremy L Thompson max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); 18730183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 18740183ed61SJeremy L Thompson } 18750183ed61SJeremy L Thompson } 18760183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 18770183ed61SJeremy L Thompson CeedEvalMode eval_mode; 18780183ed61SJeremy L Thompson 18790183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 18800183ed61SJeremy L Thompson if (eval_mode != CEED_EVAL_NONE) { 18810183ed61SJeremy L Thompson CeedInt num_comp; 18820183ed61SJeremy L Thompson CeedElemRestriction elem_rstr; 18830183ed61SJeremy L Thompson 18840183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 18850183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 18860183ed61SJeremy L Thompson max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); 18870183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 18880183ed61SJeremy L Thompson } 18890183ed61SJeremy L Thompson } 18900183ed61SJeremy L Thompson code << tab << "// Scratch restriction buffer space\n"; 18910183ed61SJeremy L Thompson code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n"; 18920183ed61SJeremy L Thompson 18930183ed61SJeremy L Thompson // -- Determine best input field processing order 18940183ed61SJeremy L Thompson CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX]; 18950183ed61SJeremy L Thompson 18960183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 18970183ed61SJeremy L Thompson field_rstr_in_buffer[i] = -1; 18980183ed61SJeremy L Thompson input_field_order[i] = -1; 18990183ed61SJeremy L Thompson } 19000183ed61SJeremy L Thompson { 19010183ed61SJeremy L Thompson bool is_ordered[CEED_FIELD_MAX]; 19020183ed61SJeremy L Thompson CeedInt curr_index = 0; 19030183ed61SJeremy L Thompson 19040183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false; 19050183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 19060183ed61SJeremy L Thompson CeedVector vec_i; 19070183ed61SJeremy L Thompson CeedElemRestriction rstr_i; 19080183ed61SJeremy L Thompson 19090183ed61SJeremy L Thompson if (is_ordered[i]) continue; 19100183ed61SJeremy L Thompson field_rstr_in_buffer[i] = i; 19110183ed61SJeremy L Thompson is_ordered[i] = true; 19120183ed61SJeremy L Thompson input_field_order[curr_index] = i; 19130183ed61SJeremy L Thompson curr_index++; 19140183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i)); 19150183ed61SJeremy L Thompson if (vec_i == CEED_VECTOR_NONE) continue; // CEED_EVAL_WEIGHT 19160183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i)); 19170183ed61SJeremy L Thompson for (CeedInt j = i + 1; j < num_input_fields; j++) { 19180183ed61SJeremy L Thompson CeedVector vec_j; 19190183ed61SJeremy L Thompson CeedElemRestriction rstr_j; 19200183ed61SJeremy L Thompson 19210183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j)); 19220183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j)); 19230183ed61SJeremy L Thompson if (rstr_i == rstr_j && vec_i == vec_j) { 19240183ed61SJeremy L Thompson field_rstr_in_buffer[j] = i; 19250183ed61SJeremy L Thompson is_ordered[j] = true; 19260183ed61SJeremy L Thompson input_field_order[curr_index] = j; 19270183ed61SJeremy L Thompson curr_index++; 19280183ed61SJeremy L Thompson } 19290183ed61SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec_j)); 19300183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); 19310183ed61SJeremy L Thompson } 19320183ed61SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec_i)); 19330183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); 19340183ed61SJeremy L Thompson } 19350183ed61SJeremy L Thompson } 19360183ed61SJeremy L Thompson 19370183ed61SJeremy L Thompson // -- Input restriction and basis 19380183ed61SJeremy L Thompson code << "\n" << tab << "// -- Input field restrictions and basis actions\n"; 19390183ed61SJeremy L Thompson CeedInt active_field_index = -1; 19400183ed61SJeremy L Thompson 19410183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 19420183ed61SJeremy L Thompson bool is_active = false; 19430183ed61SJeremy L Thompson const char *field_name; 19440183ed61SJeremy L Thompson const CeedInt f = input_field_order[i]; 19450183ed61SJeremy L Thompson 19460183ed61SJeremy L Thompson { 19470183ed61SJeremy L Thompson CeedVector vec; 19480183ed61SJeremy L Thompson 19490183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec)); 19500183ed61SJeremy L Thompson is_active = vec == CEED_VECTOR_ACTIVE; 19510183ed61SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec)); 19520183ed61SJeremy L Thompson } 19530183ed61SJeremy L Thompson 19540183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name)); 19550183ed61SJeremy L Thompson code << tab << "// ---- Input field " << f << ": " << field_name << "\n"; 19560183ed61SJeremy L Thompson 19570183ed61SJeremy L Thompson if (is_active) { 19580183ed61SJeremy L Thompson std::string var_suffix = "_in_" + std::to_string(f); 19590183ed61SJeremy L Thompson 19600183ed61SJeremy L Thompson code << tab << "// Active field - no restriction or basis action here\n"; 19610183ed61SJeremy L Thompson if (active_field_index == -1) { 19620183ed61SJeremy L Thompson active_field_index = f; 19630183ed61SJeremy L Thompson code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? "P_1d" + var_suffix : "1") 19640183ed61SJeremy L Thompson << "] = {0.0};\n"; 19650183ed61SJeremy L Thompson } else { 19660183ed61SJeremy L Thompson code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_in_" << active_field_index << ";\n"; 19670183ed61SJeremy L Thompson } 19680183ed61SJeremy L Thompson } else { 19690183ed61SJeremy L Thompson // ---- Restriction 19700183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], 19710183ed61SJeremy L Thompson max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices)); 19720183ed61SJeremy L Thompson 19730183ed61SJeremy L Thompson // ---- Basis action 19740183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true, 19750183ed61SJeremy L Thompson is_all_tensor, is_at_points, use_3d_slices)); 19760183ed61SJeremy L Thompson } 19770183ed61SJeremy L Thompson } 19780183ed61SJeremy L Thompson 19790183ed61SJeremy L Thompson // -- Loop over active field 19800183ed61SJeremy L Thompson std::string active_var_suffix = "_in_" + std::to_string(active_field_index); 19810183ed61SJeremy L Thompson 19820183ed61SJeremy L Thompson code << "\n" << tab << "// Loop over nodes in active field\n"; 19830183ed61SJeremy L Thompson code << tab << "for (CeedInt n = 0; n < num_comp" << active_var_suffix << "*P_1d" << active_var_suffix 19840183ed61SJeremy L Thompson << (max_dim > 1 ? "*P_1d" + active_var_suffix : "") << (max_dim > 2 ? "*P_1d" + active_var_suffix : "") << "; n++) {\n"; 19850183ed61SJeremy L Thompson tab.push(); 19860183ed61SJeremy L Thompson 19870183ed61SJeremy L Thompson // -- Set current active node and component to 1 19880183ed61SJeremy L Thompson code << tab << "// Set current active node and component to 1.0\n"; 19890183ed61SJeremy L Thompson code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 1.0, r_e" 19900183ed61SJeremy L Thompson << active_var_suffix << ");\n\n"; 19910183ed61SJeremy L Thompson 19920183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 19930183ed61SJeremy L Thompson bool is_active = false; 19940183ed61SJeremy L Thompson const char *field_name; 19950183ed61SJeremy L Thompson const CeedInt f = input_field_order[i]; 19960183ed61SJeremy L Thompson 19970183ed61SJeremy L Thompson { 19980183ed61SJeremy L Thompson CeedVector vec; 19990183ed61SJeremy L Thompson 20000183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec)); 20010183ed61SJeremy L Thompson is_active = vec == CEED_VECTOR_ACTIVE; 20020183ed61SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec)); 20030183ed61SJeremy L Thompson } 20040183ed61SJeremy L Thompson if (!is_active) continue; 20050183ed61SJeremy L Thompson 20060183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name)); 20070183ed61SJeremy L Thompson code << tab << "// ---- Input field " << f << ": " << field_name << "\n"; 20080183ed61SJeremy L Thompson 20090183ed61SJeremy L Thompson // ---- Basis action 20100183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true, 20110183ed61SJeremy L Thompson is_all_tensor, is_at_points, use_3d_slices)); 20120183ed61SJeremy L Thompson } 20130183ed61SJeremy L Thompson 20140183ed61SJeremy L Thompson // -- Q function 20150183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields, 20160183ed61SJeremy L Thompson qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name, 2017*745f16d1SZach Atkins Q_1d, is_all_tensor, is_at_points, use_3d_slices, true)); 20180183ed61SJeremy L Thompson 20190183ed61SJeremy L Thompson // -- Output basis and restriction 20200183ed61SJeremy L Thompson code << "\n" << tab << "// -- Output field basis action and restrictions\n"; 20210183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 20220183ed61SJeremy L Thompson bool is_active = false; 20230183ed61SJeremy L Thompson const char *field_name; 20240183ed61SJeremy L Thompson 20250183ed61SJeremy L Thompson { 20260183ed61SJeremy L Thompson CeedVector vec; 20270183ed61SJeremy L Thompson 20280183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); 20290183ed61SJeremy L Thompson is_active = vec == CEED_VECTOR_ACTIVE; 20300183ed61SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec)); 20310183ed61SJeremy L Thompson } 20320183ed61SJeremy L Thompson if (!is_active) continue; 20330183ed61SJeremy L Thompson 20340183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 20350183ed61SJeremy L Thompson code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 20360183ed61SJeremy L Thompson 20370183ed61SJeremy L Thompson // ---- Basis action 20380183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false, 20390183ed61SJeremy L Thompson is_all_tensor, is_at_points, use_3d_slices)); 20400183ed61SJeremy L Thompson 20410183ed61SJeremy L Thompson // ---- Restriction 20420183ed61SJeremy L Thompson if (is_full) { 2043692716b7SZach Atkins std::string var_suffix = "_out_" + std::to_string(i); 2044692716b7SZach Atkins CeedInt comp_stride; 2045692716b7SZach Atkins CeedSize l_size; 2046692716b7SZach Atkins CeedElemRestriction elem_rstr; 2047692716b7SZach Atkins 2048692716b7SZach Atkins CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 2049692716b7SZach Atkins CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 2050692716b7SZach Atkins code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; 2051692716b7SZach Atkins CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 2052692716b7SZach Atkins code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 2053692716b7SZach Atkins code << tab << "WriteLVecStandard" << max_dim << "d_Assembly<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix 2054692716b7SZach Atkins << ">(data, l_size" << var_suffix << ", elem, n, r_e" << var_suffix << ", values_array);\n"; 2055692716b7SZach Atkins CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 20560183ed61SJeremy L Thompson } else { 20570183ed61SJeremy L Thompson std::string var_suffix = "_out_" + std::to_string(i); 20580183ed61SJeremy L Thompson CeedInt comp_stride; 20590183ed61SJeremy L Thompson CeedSize l_size; 20600183ed61SJeremy L Thompson CeedElemRestriction elem_rstr; 20610183ed61SJeremy L Thompson 20620183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 20630183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 20640183ed61SJeremy L Thompson code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; 20650183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 20660183ed61SJeremy L Thompson code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 20670183ed61SJeremy L Thompson code << tab << "WriteLVecStandard" << max_dim << "d_Single<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix 20680183ed61SJeremy L Thompson << ">(data, l_size" << var_suffix << ", elem, n, indices.outputs[" << i << "], r_e" << var_suffix << ", values_array);\n"; 20690183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 20700183ed61SJeremy L Thompson } 20710183ed61SJeremy L Thompson } 20720183ed61SJeremy L Thompson 20730183ed61SJeremy L Thompson // -- Reset current active node and component 20740183ed61SJeremy L Thompson code << "\n" << tab << "// Reset current active node and component to 0.0\n"; 20750183ed61SJeremy L Thompson code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 0.0, r_e" 20760183ed61SJeremy L Thompson << active_var_suffix << ");\n"; 20770183ed61SJeremy L Thompson 20780183ed61SJeremy L Thompson // -- End of loop over active field 20790183ed61SJeremy L Thompson tab.pop(); 20800183ed61SJeremy L Thompson code << tab << "}\n"; 20810183ed61SJeremy L Thompson 20820183ed61SJeremy L Thompson // Close loop and function 20830183ed61SJeremy L Thompson tab.pop(); 20840183ed61SJeremy L Thompson code << tab << "}\n"; 20850183ed61SJeremy L Thompson tab.pop(); 20860183ed61SJeremy L Thompson code << tab << "}\n"; 20870183ed61SJeremy L Thompson code << tab << "// -----------------------------------------------------------------------------\n\n"; 20880183ed61SJeremy L Thompson 20890183ed61SJeremy L Thompson CeedInt block_sizes[3] = {0, 0, 0}; 20900183ed61SJeremy L Thompson CeedInt num_elem; 20910183ed61SJeremy L Thompson 20920183ed61SJeremy L Thompson // Compile 20930183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); 20940183ed61SJeremy L Thompson CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes)); 20950183ed61SJeremy L Thompson { 20960183ed61SJeremy L Thompson bool is_compile_good = false; 20970183ed61SJeremy L Thompson 20980183ed61SJeremy L Thompson data->thread_1d = block_sizes[0]; 20990183ed61SJeremy L Thompson CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, 21000183ed61SJeremy L Thompson is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 2, "OP_T_1D", block_sizes[0], 21010183ed61SJeremy L Thompson "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2])); 21020183ed61SJeremy L Thompson if (is_compile_good) { 21030183ed61SJeremy L Thompson *is_good_build = true; 21040183ed61SJeremy L Thompson CeedCallBackend(CeedGetKernel_Hip(ceed, is_full ? data->module_assemble_full : data->module_assemble_diagonal, operator_name.c_str(), 21050183ed61SJeremy L Thompson is_full ? &data->assemble_full : &data->assemble_diagonal)); 21060183ed61SJeremy L Thompson } else { 21070183ed61SJeremy L Thompson *is_good_build = false; 21080183ed61SJeremy L Thompson data->use_assembly_fallback = true; 21090183ed61SJeremy L Thompson } 21100183ed61SJeremy L Thompson } 21110183ed61SJeremy L Thompson CeedCallBackend(CeedDestroy(&ceed)); 21120183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionDestroy(&qf)); 21130183ed61SJeremy L Thompson return CEED_ERROR_SUCCESS; 21140183ed61SJeremy L Thompson } 21150183ed61SJeremy L Thompson 21160183ed61SJeremy L Thompson extern "C" int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) { 21170183ed61SJeremy L Thompson return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, false, is_good_build); 21180183ed61SJeremy L Thompson } 21190183ed61SJeremy L Thompson 2120692716b7SZach Atkins extern "C" int CeedOperatorBuildKernelFullAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) { 2121692716b7SZach Atkins return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, true, is_good_build); 2122692716b7SZach Atkins } 21235daefc96SJeremy L Thompson //------------------------------------------------------------------------------ 21245daefc96SJeremy L Thompson // Build QFunction assembly operator kernel 21255daefc96SJeremy L Thompson //------------------------------------------------------------------------------ 21265daefc96SJeremy L Thompson extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperator op, bool *is_good_build) { 21275daefc96SJeremy L Thompson bool is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false; 21285daefc96SJeremy L Thompson Ceed ceed; 21295daefc96SJeremy L Thompson CeedInt Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0; 21305daefc96SJeremy L Thompson CeedQFunctionField *qf_input_fields, *qf_output_fields; 21315daefc96SJeremy L Thompson CeedQFunction_Hip_gen *qf_data; 21325daefc96SJeremy L Thompson CeedQFunction qf; 21335daefc96SJeremy L Thompson CeedOperatorField *op_input_fields, *op_output_fields; 21345daefc96SJeremy L Thompson CeedOperator_Hip_gen *data; 21355daefc96SJeremy L Thompson std::ostringstream code; 21365daefc96SJeremy L Thompson Tab tab; 21375daefc96SJeremy L Thompson 21385daefc96SJeremy L Thompson // Check compatibility 21395daefc96SJeremy L Thompson CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 21405daefc96SJeremy L Thompson CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points)); 21415daefc96SJeremy L Thompson CeedCheck(!is_at_points, ceed, CEED_ERROR_BACKEND, "AtPoints QFunction assembly is not supported"); 21425daefc96SJeremy L Thompson 21435daefc96SJeremy L Thompson // Check field compatibility 21445daefc96SJeremy L Thompson CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 21455daefc96SJeremy L Thompson { 21465daefc96SJeremy L Thompson bool has_shared_bases = true; 21475daefc96SJeremy L Thompson 21485daefc96SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 21495daefc96SJeremy L Thompson CeedBasis basis; 21505daefc96SJeremy L Thompson 21515daefc96SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 21525daefc96SJeremy L Thompson if (basis != CEED_BASIS_NONE) { 21535daefc96SJeremy L Thompson bool is_tensor = true; 21545daefc96SJeremy L Thompson const char *resource; 21555daefc96SJeremy L Thompson char *resource_root; 21565daefc96SJeremy L Thompson Ceed basis_ceed; 21575daefc96SJeremy L Thompson 21585daefc96SJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 21595daefc96SJeremy L Thompson is_all_tensor = is_all_tensor && is_tensor; 21605daefc96SJeremy L Thompson is_all_nontensor = is_all_nontensor && !is_tensor; 21615daefc96SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); 21625daefc96SJeremy L Thompson CeedCallBackend(CeedGetResource(basis_ceed, &resource)); 21635daefc96SJeremy L Thompson CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); 21645daefc96SJeremy L Thompson has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared"); 21655daefc96SJeremy L Thompson CeedCallBackend(CeedFree(&resource_root)); 21665daefc96SJeremy L Thompson CeedCallBackend(CeedDestroy(&basis_ceed)); 21675daefc96SJeremy L Thompson } 21685daefc96SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 21695daefc96SJeremy L Thompson } 21705daefc96SJeremy L Thompson 21715daefc96SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 21725daefc96SJeremy L Thompson CeedBasis basis; 21735daefc96SJeremy L Thompson 21745daefc96SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 21755daefc96SJeremy L Thompson if (basis != CEED_BASIS_NONE) { 21765daefc96SJeremy L Thompson bool is_tensor = true; 21775daefc96SJeremy L Thompson const char *resource; 21785daefc96SJeremy L Thompson char *resource_root; 21795daefc96SJeremy L Thompson Ceed basis_ceed; 21805daefc96SJeremy L Thompson 21815daefc96SJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 21825daefc96SJeremy L Thompson is_all_tensor = is_all_tensor && is_tensor; 21835daefc96SJeremy L Thompson is_all_nontensor = is_all_nontensor && !is_tensor; 21845daefc96SJeremy L Thompson 21855daefc96SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); 21865daefc96SJeremy L Thompson CeedCallBackend(CeedGetResource(basis_ceed, &resource)); 21875daefc96SJeremy L Thompson CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); 21885daefc96SJeremy L Thompson has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared"); 21895daefc96SJeremy L Thompson CeedCallBackend(CeedFree(&resource_root)); 21905daefc96SJeremy L Thompson CeedCallBackend(CeedDestroy(&basis_ceed)); 21915daefc96SJeremy L Thompson } 21925daefc96SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 21935daefc96SJeremy L Thompson } 21945daefc96SJeremy L Thompson } 21955daefc96SJeremy L Thompson 21965daefc96SJeremy L Thompson // Retrieve operator data 21975daefc96SJeremy L Thompson CeedCallBackend(CeedOperatorGetData(op, &data)); 21985daefc96SJeremy L Thompson Q = data->Q; 21995daefc96SJeremy L Thompson Q_1d = data->Q_1d; 22005daefc96SJeremy L Thompson max_dim = data->dim; 22015daefc96SJeremy L Thompson CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 22025daefc96SJeremy L Thompson CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); 22035daefc96SJeremy L Thompson CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 22045daefc96SJeremy L Thompson 22055daefc96SJeremy L Thompson // Load basis source files 22065daefc96SJeremy L Thompson if (!is_all_nontensor) { 22075daefc96SJeremy L Thompson code << tab << "// Tensor basis source\n"; 22085daefc96SJeremy L Thompson code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n"; 22095daefc96SJeremy L Thompson } 22105daefc96SJeremy L Thompson if (!is_all_tensor) { 22115daefc96SJeremy L Thompson code << tab << "// Non-tensor basis source\n"; 22125daefc96SJeremy L Thompson code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n"; 22135daefc96SJeremy L Thompson } 22145daefc96SJeremy L Thompson if (!is_all_tensor && !is_all_nontensor) { 22155daefc96SJeremy L Thompson code << "// Tensor basis source\n"; 22165daefc96SJeremy L Thompson code << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h>\n\n"; 22175daefc96SJeremy L Thompson } 22185daefc96SJeremy L Thompson code << "// CodeGen operator source\n"; 22195daefc96SJeremy L Thompson code << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n"; 22205daefc96SJeremy L Thompson 22215daefc96SJeremy L Thompson // Get QFunction name 22225daefc96SJeremy L Thompson std::string qfunction_name(qf_data->qfunction_name); 22235daefc96SJeremy L Thompson std::string operator_name; 22245daefc96SJeremy L Thompson 22255daefc96SJeremy L Thompson operator_name = "CeedKernelHipGenQFunctionAssembly_" + qfunction_name; 22265daefc96SJeremy L Thompson 22275daefc96SJeremy L Thompson // Define CEED_Q_VLA 22285daefc96SJeremy L Thompson code << "\n" << tab << "#undef CEED_Q_VLA\n"; 22295daefc96SJeremy L Thompson if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) { 22305daefc96SJeremy L Thompson code << tab << "#define CEED_Q_VLA 1\n\n"; 22315daefc96SJeremy L Thompson } else { 22325daefc96SJeremy L Thompson code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n"; 22335daefc96SJeremy L Thompson } 22345daefc96SJeremy L Thompson 22355daefc96SJeremy L Thompson // Add user QFunction source 22365daefc96SJeremy L Thompson { 22375daefc96SJeremy L Thompson const char *source_path; 22385daefc96SJeremy L Thompson 22395daefc96SJeremy L Thompson CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path)); 22405daefc96SJeremy L Thompson CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file"); 22415daefc96SJeremy L Thompson 22425daefc96SJeremy L Thompson code << tab << "// User QFunction source\n"; 22435daefc96SJeremy L Thompson code << tab << "#include \"" << source_path << "\"\n\n"; 22445daefc96SJeremy L Thompson } 22455daefc96SJeremy L Thompson 22465daefc96SJeremy L Thompson // Setup 22475daefc96SJeremy L Thompson code << "\n" << tab << "// -----------------------------------------------------------------------------\n"; 22485daefc96SJeremy L Thompson code << tab << "// Operator Assembly Kernel\n"; 22495daefc96SJeremy L Thompson code << tab << "// \n"; 22505daefc96SJeremy L Thompson code << tab << "// d_[in,out]_i: CeedVector device array\n"; 22515daefc96SJeremy L Thompson code << tab << "// r_[in,out]_e_i: Element vector register\n"; 22525daefc96SJeremy L Thompson code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n"; 22535daefc96SJeremy L Thompson code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n"; 22545daefc96SJeremy L Thompson code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n"; 22555daefc96SJeremy L Thompson code << tab << "// \n"; 22565daefc96SJeremy L Thompson code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n"; 22575daefc96SJeremy L Thompson code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; 22585daefc96SJeremy L Thompson code << tab << "// -----------------------------------------------------------------------------\n"; 22595daefc96SJeremy L Thompson code << tab << "extern \"C\" __global__ void " << operator_name 22605daefc96SJeremy L Thompson << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar *W, Points_Hip " 22615daefc96SJeremy L Thompson "points, CeedScalar *__restrict__ values_array) {\n"; 22625daefc96SJeremy L Thompson tab.push(); 22635daefc96SJeremy L Thompson 22645daefc96SJeremy L Thompson // Scratch buffers 22655daefc96SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 22665daefc96SJeremy L Thompson CeedEvalMode eval_mode; 22675daefc96SJeremy L Thompson 22685daefc96SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 22695daefc96SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 22705daefc96SJeremy L Thompson code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; 22715daefc96SJeremy L Thompson } 22725daefc96SJeremy L Thompson } 22735daefc96SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 2274*745f16d1SZach Atkins bool is_active = false; 2275*745f16d1SZach Atkins 2276*745f16d1SZach Atkins { 2277*745f16d1SZach Atkins CeedVector vec; 2278*745f16d1SZach Atkins 2279*745f16d1SZach Atkins CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); 2280*745f16d1SZach Atkins is_active = vec == CEED_VECTOR_ACTIVE; 2281*745f16d1SZach Atkins CeedCallBackend(CeedVectorDestroy(&vec)); 2282*745f16d1SZach Atkins } 2283*745f16d1SZach Atkins if (is_active) { 22845daefc96SJeremy L Thompson code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; 22855daefc96SJeremy L Thompson } 2286*745f16d1SZach Atkins } 22875daefc96SJeremy L Thompson 22885daefc96SJeremy L Thompson code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; 22895daefc96SJeremy L Thompson if (!is_all_tensor) { 22905daefc96SJeremy L Thompson code << tab << "const CeedInt Q = " << Q << ";\n"; 22915daefc96SJeremy L Thompson } 22925daefc96SJeremy L Thompson if (!is_all_nontensor) { 22935daefc96SJeremy L Thompson code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n"; 22945daefc96SJeremy L Thompson } 22955daefc96SJeremy L Thompson 22965daefc96SJeremy L Thompson // Shared data 22975daefc96SJeremy L Thompson code << tab << "extern __shared__ CeedScalar slice[];\n"; 22985daefc96SJeremy L Thompson code << tab << "SharedData_Hip data;\n"; 22995daefc96SJeremy L Thompson code << tab << "data.t_id_x = threadIdx.x;\n"; 23005daefc96SJeremy L Thompson code << tab << "data.t_id_y = threadIdx.y;\n"; 23015daefc96SJeremy L Thompson code << tab << "data.t_id_z = threadIdx.z;\n"; 23025daefc96SJeremy L Thompson code << tab << "data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 23035daefc96SJeremy L Thompson code << tab << "data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n"; 23045daefc96SJeremy L Thompson 23055daefc96SJeremy L Thompson // -- Determine input mat reuse 23065daefc96SJeremy L Thompson FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX]; 23075daefc96SJeremy L Thompson 23085daefc96SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 23095daefc96SJeremy L Thompson input_matrix_reuse[i].index = -1; 23105daefc96SJeremy L Thompson } 23115daefc96SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 23125daefc96SJeremy L Thompson bool is_tensor = true; 23135daefc96SJeremy L Thompson CeedEvalMode eval_mode_i; 23145daefc96SJeremy L Thompson CeedBasis basis_i; 23155daefc96SJeremy L Thompson 23165daefc96SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i)); 23175daefc96SJeremy L Thompson if (eval_mode_i == CEED_EVAL_WEIGHT) continue; 23185daefc96SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i)); 23195daefc96SJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor)); 23205daefc96SJeremy L Thompson for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) { 23215daefc96SJeremy L Thompson CeedEvalMode eval_mode_j; 23225daefc96SJeremy L Thompson CeedBasis basis_j; 23235daefc96SJeremy L Thompson 23245daefc96SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); 23255daefc96SJeremy L Thompson if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 23265daefc96SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); 23275daefc96SJeremy L Thompson if (basis_i == basis_j) { 23285daefc96SJeremy L Thompson if (is_tensor) { 23295daefc96SJeremy L Thompson input_matrix_reuse[i].index = j; 23305daefc96SJeremy L Thompson input_matrix_reuse[i].is_input = true; 23315daefc96SJeremy L Thompson input_matrix_reuse[i].eval_mode = eval_mode_j; 23325daefc96SJeremy L Thompson } else { 23335daefc96SJeremy L Thompson // For non-tensor can only re-use with the same eval mode 23345daefc96SJeremy L Thompson if (eval_mode_i == eval_mode_j) { 23355daefc96SJeremy L Thompson input_matrix_reuse[i].index = j; 23365daefc96SJeremy L Thompson input_matrix_reuse[i].is_input = true; 23375daefc96SJeremy L Thompson input_matrix_reuse[i].eval_mode = eval_mode_j; 23385daefc96SJeremy L Thompson } 23395daefc96SJeremy L Thompson } 23405daefc96SJeremy L Thompson } 23415daefc96SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_j)); 23425daefc96SJeremy L Thompson } 23435daefc96SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_i)); 23445daefc96SJeremy L Thompson } 23455daefc96SJeremy L Thompson 23465daefc96SJeremy L Thompson // -- Determine output mat reuse 23475daefc96SJeremy L Thompson FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX]; 23485daefc96SJeremy L Thompson 23495daefc96SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 23505daefc96SJeremy L Thompson output_matrix_reuse[i].index = -1; 23515daefc96SJeremy L Thompson } 23525daefc96SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 23535daefc96SJeremy L Thompson bool is_tensor = true; 23545daefc96SJeremy L Thompson CeedEvalMode eval_mode_i; 23555daefc96SJeremy L Thompson CeedBasis basis_i; 23565daefc96SJeremy L Thompson 23575daefc96SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i)); 23585daefc96SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i)); 23595daefc96SJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor)); 23605daefc96SJeremy L Thompson for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) { 23615daefc96SJeremy L Thompson CeedEvalMode eval_mode_j; 23625daefc96SJeremy L Thompson CeedBasis basis_j; 23635daefc96SJeremy L Thompson 23645daefc96SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); 23655daefc96SJeremy L Thompson if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 23665daefc96SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); 23675daefc96SJeremy L Thompson if (basis_i == basis_j) { 23685daefc96SJeremy L Thompson if (is_tensor) { 23695daefc96SJeremy L Thompson output_matrix_reuse[i].index = j; 23705daefc96SJeremy L Thompson output_matrix_reuse[i].is_input = true; 23715daefc96SJeremy L Thompson output_matrix_reuse[i].eval_mode = eval_mode_j; 23725daefc96SJeremy L Thompson } else { 23735daefc96SJeremy L Thompson // For non-tensor can only re-use with the same eval mode 23745daefc96SJeremy L Thompson if (eval_mode_i == eval_mode_j) { 23755daefc96SJeremy L Thompson output_matrix_reuse[i].index = j; 23765daefc96SJeremy L Thompson output_matrix_reuse[i].is_input = true; 23775daefc96SJeremy L Thompson output_matrix_reuse[i].eval_mode = eval_mode_j; 23785daefc96SJeremy L Thompson } 23795daefc96SJeremy L Thompson } 23805daefc96SJeremy L Thompson } 23815daefc96SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_j)); 23825daefc96SJeremy L Thompson } 23835daefc96SJeremy L Thompson for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) { 23845daefc96SJeremy L Thompson CeedEvalMode eval_mode_j; 23855daefc96SJeremy L Thompson CeedBasis basis_j; 23865daefc96SJeremy L Thompson 23875daefc96SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j)); 23885daefc96SJeremy L Thompson if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 23895daefc96SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j)); 23905daefc96SJeremy L Thompson if (basis_i == basis_j) { 23915daefc96SJeremy L Thompson if (is_tensor) { 23925daefc96SJeremy L Thompson output_matrix_reuse[i].index = j; 23935daefc96SJeremy L Thompson output_matrix_reuse[i].is_input = false; 23945daefc96SJeremy L Thompson output_matrix_reuse[i].eval_mode = eval_mode_j; 23955daefc96SJeremy L Thompson } else { 23965daefc96SJeremy L Thompson // For non-tensor can only re-use with the same eval mode 23975daefc96SJeremy L Thompson if (eval_mode_i == eval_mode_j) { 23985daefc96SJeremy L Thompson output_matrix_reuse[i].index = j; 23995daefc96SJeremy L Thompson output_matrix_reuse[i].is_input = false; 24005daefc96SJeremy L Thompson output_matrix_reuse[i].eval_mode = eval_mode_j; 24015daefc96SJeremy L Thompson } 24025daefc96SJeremy L Thompson } 24035daefc96SJeremy L Thompson } 24045daefc96SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_j)); 24055daefc96SJeremy L Thompson } 24065daefc96SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_i)); 24075daefc96SJeremy L Thompson } 24085daefc96SJeremy L Thompson 24095daefc96SJeremy L Thompson // Initialize constants, and matrices B and G 24105daefc96SJeremy L Thompson code << "\n" << tab << "// Input field constants and basis data\n"; 24115daefc96SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 24125daefc96SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], 2413ca1da9b9SJeremy L Thompson max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, true)); 24145daefc96SJeremy L Thompson } 24155daefc96SJeremy L Thompson code << "\n" << tab << "// Output field constants and basis data\n"; 24165daefc96SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 24175daefc96SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], 2418ca1da9b9SJeremy L Thompson max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, true)); 24195daefc96SJeremy L Thompson } 24205daefc96SJeremy L Thompson 24215daefc96SJeremy L Thompson // Loop over all elements 24225daefc96SJeremy L Thompson code << "\n" << tab << "// Element loop\n"; 24235daefc96SJeremy L Thompson code << tab << "__syncthreads();\n"; 24245daefc96SJeremy L Thompson code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n"; 24255daefc96SJeremy L Thompson tab.push(); 24265daefc96SJeremy L Thompson 24275daefc96SJeremy L Thompson // -- Compute minimum buffer space needed 24285daefc96SJeremy L Thompson CeedInt max_rstr_buffer_size = 1; 24295daefc96SJeremy L Thompson 24305daefc96SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 24315daefc96SJeremy L Thompson CeedEvalMode eval_mode; 24325daefc96SJeremy L Thompson 24335daefc96SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 24345daefc96SJeremy L Thompson if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) { 24355daefc96SJeremy L Thompson CeedInt num_comp; 24365daefc96SJeremy L Thompson CeedElemRestriction elem_rstr; 24375daefc96SJeremy L Thompson 24385daefc96SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 24395daefc96SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 24405daefc96SJeremy L Thompson max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); 24415daefc96SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 24425daefc96SJeremy L Thompson } 24435daefc96SJeremy L Thompson } 24445daefc96SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 24455daefc96SJeremy L Thompson CeedEvalMode eval_mode; 24465daefc96SJeremy L Thompson 24475daefc96SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 24485daefc96SJeremy L Thompson if (eval_mode != CEED_EVAL_NONE) { 24495daefc96SJeremy L Thompson CeedInt num_comp; 24505daefc96SJeremy L Thompson CeedElemRestriction elem_rstr; 24515daefc96SJeremy L Thompson 24525daefc96SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 24535daefc96SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 24545daefc96SJeremy L Thompson max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); 24555daefc96SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 24565daefc96SJeremy L Thompson } 24575daefc96SJeremy L Thompson } 24585daefc96SJeremy L Thompson code << tab << "// Scratch restriction buffer space\n"; 24595daefc96SJeremy L Thompson code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n"; 24605daefc96SJeremy L Thompson 24615daefc96SJeremy L Thompson // -- Determine best input field processing order 24625daefc96SJeremy L Thompson CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX]; 24635daefc96SJeremy L Thompson 24645daefc96SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 24655daefc96SJeremy L Thompson field_rstr_in_buffer[i] = -1; 24665daefc96SJeremy L Thompson input_field_order[i] = -1; 24675daefc96SJeremy L Thompson } 24685daefc96SJeremy L Thompson { 24695daefc96SJeremy L Thompson bool is_ordered[CEED_FIELD_MAX]; 24705daefc96SJeremy L Thompson CeedInt curr_index = 0; 24715daefc96SJeremy L Thompson 24725daefc96SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false; 24735daefc96SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 24745daefc96SJeremy L Thompson CeedVector vec_i; 24755daefc96SJeremy L Thompson CeedElemRestriction rstr_i; 24765daefc96SJeremy L Thompson 24775daefc96SJeremy L Thompson if (is_ordered[i]) continue; 24785daefc96SJeremy L Thompson field_rstr_in_buffer[i] = i; 24795daefc96SJeremy L Thompson is_ordered[i] = true; 24805daefc96SJeremy L Thompson input_field_order[curr_index] = i; 24815daefc96SJeremy L Thompson curr_index++; 24825daefc96SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i)); 24835daefc96SJeremy L Thompson if (vec_i == CEED_VECTOR_NONE) continue; // CEED_EVAL_WEIGHT 24845daefc96SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i)); 24855daefc96SJeremy L Thompson for (CeedInt j = i + 1; j < num_input_fields; j++) { 24865daefc96SJeremy L Thompson CeedVector vec_j; 24875daefc96SJeremy L Thompson CeedElemRestriction rstr_j; 24885daefc96SJeremy L Thompson 24895daefc96SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j)); 24905daefc96SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j)); 24915daefc96SJeremy L Thompson if (rstr_i == rstr_j && vec_i == vec_j) { 24925daefc96SJeremy L Thompson field_rstr_in_buffer[j] = i; 24935daefc96SJeremy L Thompson is_ordered[j] = true; 24945daefc96SJeremy L Thompson input_field_order[curr_index] = j; 24955daefc96SJeremy L Thompson curr_index++; 24965daefc96SJeremy L Thompson } 24975daefc96SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec_j)); 24985daefc96SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); 24995daefc96SJeremy L Thompson } 25005daefc96SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec_i)); 25015daefc96SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); 25025daefc96SJeremy L Thompson } 25035daefc96SJeremy L Thompson } 25045daefc96SJeremy L Thompson 25055daefc96SJeremy L Thompson // -- Input restriction and basis 25065daefc96SJeremy L Thompson code << "\n" << tab << "// -- Input field restrictions and basis actions\n"; 25075daefc96SJeremy L Thompson CeedInt num_active_in = 0, num_active_out = 0, qf_assembly_size_out = 0; 25085daefc96SJeremy L Thompson CeedInt active_fields_in[CEED_FIELD_MAX], active_fields_out[CEED_FIELD_MAX]; 25095daefc96SJeremy L Thompson 25105daefc96SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 25115daefc96SJeremy L Thompson bool is_active = false; 25125daefc96SJeremy L Thompson const char *field_name; 25135daefc96SJeremy L Thompson const CeedInt f = input_field_order[i]; 25145daefc96SJeremy L Thompson 25155daefc96SJeremy L Thompson { 25165daefc96SJeremy L Thompson CeedVector vec; 25175daefc96SJeremy L Thompson 25185daefc96SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec)); 25195daefc96SJeremy L Thompson is_active = vec == CEED_VECTOR_ACTIVE; 25205daefc96SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec)); 25215daefc96SJeremy L Thompson } 25225daefc96SJeremy L Thompson 25235daefc96SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name)); 25245daefc96SJeremy L Thompson code << tab << "// ---- Input field " << f << ": " << field_name << "\n"; 25255daefc96SJeremy L Thompson 25265daefc96SJeremy L Thompson if (is_active) { 25275daefc96SJeremy L Thompson CeedEvalMode eval_mode; 25285daefc96SJeremy L Thompson CeedInt field_size; 25295daefc96SJeremy L Thompson 25305daefc96SJeremy L Thompson active_fields_in[num_active_in] = f; 25315daefc96SJeremy L Thompson num_active_in++; 25325daefc96SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[f], &field_size)); 25335daefc96SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[f], &eval_mode)); 25345daefc96SJeremy L Thompson if (eval_mode == CEED_EVAL_GRAD) { 25355daefc96SJeremy L Thompson code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << "dim_in_" << f << "*" 25365daefc96SJeremy L Thompson << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n"; 25375daefc96SJeremy L Thompson } else { 25385daefc96SJeremy L Thompson code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n"; 25395daefc96SJeremy L Thompson } 25405daefc96SJeremy L Thompson code << tab << "const CeedInt field_size_in_" << f << " = " << field_size << ";\n"; 25415daefc96SJeremy L Thompson } else { 25425daefc96SJeremy L Thompson // ---- Restriction 25435daefc96SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], 25445daefc96SJeremy L Thompson max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices)); 25455daefc96SJeremy L Thompson 25465daefc96SJeremy L Thompson // ---- Basis action 25475daefc96SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true, 25485daefc96SJeremy L Thompson is_all_tensor, is_at_points, use_3d_slices)); 25495daefc96SJeremy L Thompson } 25505daefc96SJeremy L Thompson } 25515daefc96SJeremy L Thompson code << tab << "const CeedInt field_sizes_in[" << num_active_in << "] = {"; 25525daefc96SJeremy L Thompson for (CeedInt i = 0; i < num_active_in; i++) { 25535daefc96SJeremy L Thompson code << "field_size_in_" << active_fields_in[i] << (i < num_active_in - 1 ? ", " : ""); 25545daefc96SJeremy L Thompson } 25555daefc96SJeremy L Thompson code << "};\n"; 25565daefc96SJeremy L Thompson code << tab << "CeedScalar * r_q_in[" << num_active_in << "] = {"; 25575daefc96SJeremy L Thompson for (CeedInt i = 0; i < num_active_in; i++) { 25585daefc96SJeremy L Thompson code << "r_q_in_" << active_fields_in[i] << (i < num_active_in - 1 ? ", " : ""); 25595daefc96SJeremy L Thompson } 25605daefc96SJeremy L Thompson code << "};\n"; 25615daefc96SJeremy L Thompson 25625daefc96SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 25635daefc96SJeremy L Thompson bool is_active = false; 25645daefc96SJeremy L Thompson 25655daefc96SJeremy L Thompson { 25665daefc96SJeremy L Thompson CeedVector vec; 25675daefc96SJeremy L Thompson 25685daefc96SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); 25695daefc96SJeremy L Thompson is_active = vec == CEED_VECTOR_ACTIVE; 25705daefc96SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec)); 25715daefc96SJeremy L Thompson } 25725daefc96SJeremy L Thompson if (is_active) { 25735daefc96SJeremy L Thompson const char *field_name; 25745daefc96SJeremy L Thompson CeedInt field_size; 25755daefc96SJeremy L Thompson 25765daefc96SJeremy L Thompson active_fields_out[num_active_out] = i; 25775daefc96SJeremy L Thompson num_active_out++; 25785daefc96SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size)); 25795daefc96SJeremy L Thompson qf_assembly_size_out += field_size; 25805daefc96SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 25815daefc96SJeremy L Thompson code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 25825daefc96SJeremy L Thompson code << tab << "const CeedInt field_size_out_" << i << " = " << field_size << ";\n"; 25835daefc96SJeremy L Thompson } 25845daefc96SJeremy L Thompson } 25855daefc96SJeremy L Thompson code << tab << "const CeedInt field_sizes_out[" << num_active_out << "] = {"; 25865daefc96SJeremy L Thompson for (CeedInt i = 0; i < num_active_out; i++) { 25875daefc96SJeremy L Thompson code << "field_size_out_" << active_fields_out[i] << (i < num_active_out - 1 ? ", " : ""); 25885daefc96SJeremy L Thompson } 25895daefc96SJeremy L Thompson code << "};\n"; 25905daefc96SJeremy L Thompson code << tab << "const CeedInt total_size_out = " << qf_assembly_size_out << ";\n"; 25915daefc96SJeremy L Thompson 25925daefc96SJeremy L Thompson // -- Loop over active field 25935daefc96SJeremy L Thompson code << "\n" << tab << "CeedInt input_offset = 0;\n"; 25945daefc96SJeremy L Thompson code << tab << "// Loop over active QFunction input fields\n"; 25955daefc96SJeremy L Thompson code << tab << "const CeedInt num_active_in = " << num_active_in << ";\n"; 25965daefc96SJeremy L Thompson code << tab << "for (CeedInt a = 0; a < num_active_in; a++) {\n"; 25975daefc96SJeremy L Thompson tab.push(); 25985daefc96SJeremy L Thompson 25995daefc96SJeremy L Thompson // -- Loop over size of active field 26005daefc96SJeremy L Thompson code << "\n" << tab << "// Loop over current active input field size\n"; 26015daefc96SJeremy L Thompson code << tab << "const CeedInt field_size_in = field_sizes_in[a];\n"; 26025daefc96SJeremy L Thompson code << tab << "for (CeedInt s = 0; s < field_size_in; s++) {\n"; 26035daefc96SJeremy L Thompson tab.push(); 26045daefc96SJeremy L Thompson 26055daefc96SJeremy L Thompson // -- Set current active point and component to 1 26065daefc96SJeremy L Thompson code << tab << "// Set current active point and component to 1.0\n"; 26075daefc96SJeremy L Thompson if (is_all_tensor && (max_dim >= 3)) { 26085daefc96SJeremy L Thompson code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 1.0;\n"; 26095daefc96SJeremy L Thompson } else { 26105daefc96SJeremy L Thompson code << tab << "r_q_in[a][s] = 1.0;\n"; 26115daefc96SJeremy L Thompson } 26125daefc96SJeremy L Thompson 26135daefc96SJeremy L Thompson // -- Q function 26145daefc96SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields, 26155daefc96SJeremy L Thompson qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name, 2616*745f16d1SZach Atkins Q_1d, is_all_tensor, is_at_points, use_3d_slices, true)); 26175daefc96SJeremy L Thompson 26185daefc96SJeremy L Thompson // -- Output basis and restriction 26195daefc96SJeremy L Thompson code << "\n" << tab << "// -- Output field basis action and restrictions\n"; 26205daefc96SJeremy L Thompson CeedScalar offset = 0; 26215daefc96SJeremy L Thompson 26225daefc96SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 26235daefc96SJeremy L Thompson bool is_active = false; 26245daefc96SJeremy L Thompson const char *field_name; 26255daefc96SJeremy L Thompson 26265daefc96SJeremy L Thompson { 26275daefc96SJeremy L Thompson CeedVector vec; 26285daefc96SJeremy L Thompson 26295daefc96SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); 26305daefc96SJeremy L Thompson is_active = vec == CEED_VECTOR_ACTIVE; 26315daefc96SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec)); 26325daefc96SJeremy L Thompson } 26335daefc96SJeremy L Thompson if (!is_active) continue; 26345daefc96SJeremy L Thompson 26355daefc96SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 26365daefc96SJeremy L Thompson code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 26375daefc96SJeremy L Thompson 26385daefc96SJeremy L Thompson // ---- Restriction 26395daefc96SJeremy L Thompson CeedInt field_size; 26405daefc96SJeremy L Thompson 26415daefc96SJeremy L Thompson code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d_QFAssembly<total_size_out, field_size_out_" << i << ", " 26425daefc96SJeremy L Thompson << (is_all_tensor ? "Q_1d" : "Q") << ">(data, num_elem, elem, input_offset + s, " << offset << ", r_q_out_" << i << ", values_array);\n"; 26435daefc96SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size)); 26445daefc96SJeremy L Thompson offset += field_size; 26455daefc96SJeremy L Thompson } 26465daefc96SJeremy L Thompson 26475daefc96SJeremy L Thompson // -- Reset current active node and component 26485daefc96SJeremy L Thompson code << "\n" << tab << "// Reset current active node and component to 0.0\n"; 26495daefc96SJeremy L Thompson if (is_all_tensor && (max_dim >= 3)) { 26505daefc96SJeremy L Thompson code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 0.0;\n"; 26515daefc96SJeremy L Thompson } else { 26525daefc96SJeremy L Thompson code << tab << "r_q_in[a][s] = 0.0;\n"; 26535daefc96SJeremy L Thompson } 26545daefc96SJeremy L Thompson 26555daefc96SJeremy L Thompson // -- End of loop over size of active field 26565daefc96SJeremy L Thompson tab.pop(); 26575daefc96SJeremy L Thompson code << tab << "}\n"; 26585daefc96SJeremy L Thompson code << tab << "input_offset += field_size_in;\n"; 26595daefc96SJeremy L Thompson 26605daefc96SJeremy L Thompson // -- End of loop over active field 26615daefc96SJeremy L Thompson tab.pop(); 26625daefc96SJeremy L Thompson code << tab << "}\n"; 26635daefc96SJeremy L Thompson 26645daefc96SJeremy L Thompson // Close loop and function 26655daefc96SJeremy L Thompson tab.pop(); 26665daefc96SJeremy L Thompson code << tab << "}\n"; 26675daefc96SJeremy L Thompson tab.pop(); 26685daefc96SJeremy L Thompson code << tab << "}\n"; 26695daefc96SJeremy L Thompson code << tab << "// -----------------------------------------------------------------------------\n\n"; 26705daefc96SJeremy L Thompson 26715daefc96SJeremy L Thompson CeedInt block_sizes[3] = {0, 0, 0}; 26725daefc96SJeremy L Thompson CeedInt num_elem; 26735daefc96SJeremy L Thompson 26745daefc96SJeremy L Thompson // Compile 26755daefc96SJeremy L Thompson CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); 26765daefc96SJeremy L Thompson CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes)); 26775daefc96SJeremy L Thompson { 26785daefc96SJeremy L Thompson bool is_compile_good = false; 26795daefc96SJeremy L Thompson 26805daefc96SJeremy L Thompson data->thread_1d = block_sizes[0]; 26815daefc96SJeremy L Thompson CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 2, "OP_T_1D", block_sizes[0], 26825daefc96SJeremy L Thompson "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2])); 26835daefc96SJeremy L Thompson if (is_compile_good) { 26845daefc96SJeremy L Thompson *is_good_build = true; 26855daefc96SJeremy L Thompson CeedCallBackend(CeedGetKernel_Hip(ceed, data->module_assemble_qfunction, operator_name.c_str(), &data->assemble_qfunction)); 26865daefc96SJeremy L Thompson } else { 26875daefc96SJeremy L Thompson *is_good_build = false; 26885daefc96SJeremy L Thompson data->use_assembly_fallback = true; 26895daefc96SJeremy L Thompson } 26905daefc96SJeremy L Thompson } 26915daefc96SJeremy L Thompson CeedCallBackend(CeedDestroy(&ceed)); 26925daefc96SJeremy L Thompson CeedCallBackend(CeedQFunctionDestroy(&qf)); 26935daefc96SJeremy L Thompson return CEED_ERROR_SUCCESS; 26945daefc96SJeremy L Thompson } 2695692716b7SZach Atkins 26960183ed61SJeremy L Thompson //------------------------------------------------------------------------------ 2697