1d275d636SJeremy L Thompson // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors. 23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 37d8d0e25Snbeams // 43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 57d8d0e25Snbeams // 63d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 73d576824SJeremy L Thompson 87d8d0e25Snbeams #define CEED_DEBUG_COLOR 12 97d8d0e25Snbeams 1049aac155SJeremy L Thompson #include <ceed.h> 11ec3da8bcSJed Brown #include <ceed/backend.h> 12*0183ed61SJeremy L Thompson #include <ceed/gen-tools.h> 139e201c85SYohann #include <ceed/jit-tools.h> 142b730f8bSJeremy L Thompson 157d8d0e25Snbeams #include <iostream> 167d8d0e25Snbeams #include <sstream> 172b730f8bSJeremy L Thompson #include <string> 182b730f8bSJeremy L Thompson 190d0321e0SJeremy L Thompson #include "../hip-ref/ceed-hip-ref.h" 207d8d0e25Snbeams #include "../hip-shared/ceed-hip-shared.h" 21b2165e7aSSebastian Grimberg #include "../hip/ceed-hip-common.h" 227d8d0e25Snbeams #include "../hip/ceed-hip-compile.h" 232b730f8bSJeremy L Thompson #include "ceed-hip-gen.h" 24b3e1519bSnbeams 2545a787f7SJeremy L Thompson struct FieldReuse_Hip { 2645a787f7SJeremy L Thompson CeedInt index; 2745a787f7SJeremy L Thompson bool is_input; 2845a787f7SJeremy L Thompson CeedEvalMode eval_mode; 2945a787f7SJeremy L Thompson }; 3045a787f7SJeremy L Thompson 31b3e1519bSnbeams //------------------------------------------------------------------------------ 32b3e1519bSnbeams // Calculate the block size used for launching the operator kernel 33b3e1519bSnbeams //------------------------------------------------------------------------------ 342b730f8bSJeremy L Thompson extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt num_elem, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes) { 353a2968d6SJeremy L Thompson const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); 36b3e1519bSnbeams if (dim == 1) { 373a2968d6SJeremy L Thompson CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; 38b7453713SJeremy L Thompson 399e201c85SYohann elems_per_block = elems_per_block > 0 ? elems_per_block : 1; 403a2968d6SJeremy L Thompson block_sizes[0] = thread_1d; 41b3e1519bSnbeams block_sizes[1] = 1; 429e201c85SYohann block_sizes[2] = elems_per_block; 43b3e1519bSnbeams } else if (dim == 2) { 443a2968d6SJeremy L Thompson const CeedInt elems_per_block = thread_1d < 4 ? 16 : 2; 45b7453713SJeremy L Thompson 463a2968d6SJeremy L Thompson block_sizes[0] = thread_1d; 473a2968d6SJeremy L Thompson block_sizes[1] = thread_1d; 489e201c85SYohann block_sizes[2] = elems_per_block; 49b3e1519bSnbeams } else if (dim == 3) { 503a2968d6SJeremy L Thompson const CeedInt elems_per_block = thread_1d < 6 ? 4 : (thread_1d < 8 ? 2 : 1); 51b7453713SJeremy L Thompson 523a2968d6SJeremy L Thompson block_sizes[0] = thread_1d; 533a2968d6SJeremy L Thompson block_sizes[1] = thread_1d; 549e201c85SYohann block_sizes[2] = elems_per_block; 55b3e1519bSnbeams } 56b3e1519bSnbeams return CEED_ERROR_SUCCESS; 57b3e1519bSnbeams } 58b3e1519bSnbeams 597d8d0e25Snbeams //------------------------------------------------------------------------------ 604b3e95d5SJeremy L Thompson // Determine type of operator 614b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 624b3e95d5SJeremy L Thompson static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields, 634b3e95d5SJeremy L Thompson CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields, 6474398b5aSJeremy L Thompson CeedQFunctionField *qf_output_fields, CeedInt *max_P, CeedInt *max_P_1d, CeedInt *Q, CeedInt *Q_1d, 6574398b5aSJeremy L Thompson CeedInt *max_dim, bool *is_all_tensor, bool *use_3d_slices) { 6674398b5aSJeremy L Thompson // Check if all are tensor 6774398b5aSJeremy L Thompson *is_all_tensor = true; 684b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 694b3e95d5SJeremy L Thompson CeedBasis basis; 704b3e95d5SJeremy L Thompson 714b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 724b3e95d5SJeremy L Thompson if (basis != CEED_BASIS_NONE) { 734b3e95d5SJeremy L Thompson bool is_field_tensor; 744b3e95d5SJeremy L Thompson 754b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor)); 7674398b5aSJeremy L Thompson *is_all_tensor = *is_all_tensor && is_field_tensor; 774b3e95d5SJeremy L Thompson } 783a2968d6SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 794b3e95d5SJeremy L Thompson } 804b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 814b3e95d5SJeremy L Thompson CeedBasis basis; 824b3e95d5SJeremy L Thompson 834b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 844b3e95d5SJeremy L Thompson if (basis != CEED_BASIS_NONE) { 854b3e95d5SJeremy L Thompson bool is_field_tensor; 864b3e95d5SJeremy L Thompson 874b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor)); 8874398b5aSJeremy L Thompson *is_all_tensor = *is_all_tensor && is_field_tensor; 8974398b5aSJeremy L Thompson } 9074398b5aSJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 9174398b5aSJeremy L Thompson } 9274398b5aSJeremy L Thompson 9374398b5aSJeremy L Thompson // Find max_P, max_P_1d, Q, and Q_1d 9474398b5aSJeremy L Thompson bool is_all_3d = true; 9574398b5aSJeremy L Thompson 9674398b5aSJeremy L Thompson *max_P = 0; 9774398b5aSJeremy L Thompson *max_P_1d = 0; 9874398b5aSJeremy L Thompson *Q = 0; 9974398b5aSJeremy L Thompson *Q_1d = 0; 10074398b5aSJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 10174398b5aSJeremy L Thompson CeedBasis basis; 10274398b5aSJeremy L Thompson 10374398b5aSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 10474398b5aSJeremy L Thompson if (basis != CEED_BASIS_NONE) { 10574398b5aSJeremy L Thompson bool is_field_tensor; 10674398b5aSJeremy L Thompson CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0; 10774398b5aSJeremy L Thompson 10874398b5aSJeremy L Thompson // Check if 3D 1094b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &field_dim)); 11074398b5aSJeremy L Thompson is_all_3d = is_all_3d && (field_dim == 3); 11174398b5aSJeremy L Thompson *max_dim = CeedIntMax(*max_dim, field_dim); 11274398b5aSJeremy L Thompson 11374398b5aSJeremy L Thompson // Collect P, P_1d, Q, and Q_1d 11474398b5aSJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P)); 11574398b5aSJeremy L Thompson *max_P = CeedIntMax(*max_P, field_P); 11674398b5aSJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor)); 11774398b5aSJeremy L Thompson if (is_field_tensor) { 11874398b5aSJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d)); 11974398b5aSJeremy L Thompson *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d); 12074398b5aSJeremy L Thompson } 12174398b5aSJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q)); 12274398b5aSJeremy L Thompson CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible"); 12374398b5aSJeremy L Thompson *Q = field_Q; 12474398b5aSJeremy L Thompson if (is_field_tensor) { 12574398b5aSJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d)); 1264b3e95d5SJeremy L Thompson CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible"); 1274b3e95d5SJeremy L Thompson *Q_1d = field_Q_1d; 1284b3e95d5SJeremy L Thompson } 12974398b5aSJeremy L Thompson } 13074398b5aSJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 13174398b5aSJeremy L Thompson } 13274398b5aSJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 13374398b5aSJeremy L Thompson CeedBasis basis; 13474398b5aSJeremy L Thompson 13574398b5aSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 13674398b5aSJeremy L Thompson if (basis != CEED_BASIS_NONE) { 13774398b5aSJeremy L Thompson bool is_field_tensor; 13874398b5aSJeremy L Thompson CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0; 13974398b5aSJeremy L Thompson 14074398b5aSJeremy L Thompson // Check if 3D 14174398b5aSJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &field_dim)); 14274398b5aSJeremy L Thompson is_all_3d = is_all_3d && (field_dim == 3); 14374398b5aSJeremy L Thompson *max_dim = CeedIntMax(*max_dim, field_dim); 14474398b5aSJeremy L Thompson 14574398b5aSJeremy L Thompson // Collect P, P_1d, Q, and Q_1d 14674398b5aSJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P)); 14774398b5aSJeremy L Thompson *max_P = CeedIntMax(*max_P, field_P); 14874398b5aSJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor)); 14974398b5aSJeremy L Thompson if (is_field_tensor) { 15074398b5aSJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d)); 15174398b5aSJeremy L Thompson *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d); 15274398b5aSJeremy L Thompson } 15374398b5aSJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q)); 15474398b5aSJeremy L Thompson CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible"); 15574398b5aSJeremy L Thompson *Q = field_Q; 15674398b5aSJeremy L Thompson if (is_field_tensor) { 15774398b5aSJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d)); 15874398b5aSJeremy L Thompson CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible"); 15974398b5aSJeremy L Thompson *Q_1d = field_Q_1d; 16074398b5aSJeremy L Thompson } 16174398b5aSJeremy L Thompson } 1623a2968d6SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 1634b3e95d5SJeremy L Thompson } 1644b3e95d5SJeremy L Thompson 1654b3e95d5SJeremy L Thompson // Only use 3D collocated gradient parallelization strategy when gradient is computed 1664b3e95d5SJeremy L Thompson *use_3d_slices = false; 16774398b5aSJeremy L Thompson if (is_all_3d && *is_all_tensor) { 1684b3e95d5SJeremy L Thompson bool was_grad_found = false; 1694b3e95d5SJeremy L Thompson 1704b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 1714b3e95d5SJeremy L Thompson CeedEvalMode eval_mode; 1724b3e95d5SJeremy L Thompson 1734b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 1744b3e95d5SJeremy L Thompson if (eval_mode == CEED_EVAL_GRAD) { 1754b3e95d5SJeremy L Thompson CeedBasis_Hip_shared *basis_data; 1764b3e95d5SJeremy L Thompson CeedBasis basis; 1774b3e95d5SJeremy L Thompson 1784b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 1794b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &basis_data)); 1804b3e95d5SJeremy L Thompson *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true); 1814b3e95d5SJeremy L Thompson was_grad_found = true; 1823a2968d6SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 1834b3e95d5SJeremy L Thompson } 1844b3e95d5SJeremy L Thompson } 1854b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 1864b3e95d5SJeremy L Thompson CeedEvalMode eval_mode; 1874b3e95d5SJeremy L Thompson 1884b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 1894b3e95d5SJeremy L Thompson if (eval_mode == CEED_EVAL_GRAD) { 1904b3e95d5SJeremy L Thompson CeedBasis_Hip_shared *basis_data; 1914b3e95d5SJeremy L Thompson CeedBasis basis; 1924b3e95d5SJeremy L Thompson 1934b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 1944b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &basis_data)); 1954b3e95d5SJeremy L Thompson *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true); 1964b3e95d5SJeremy L Thompson was_grad_found = true; 1973a2968d6SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 1984b3e95d5SJeremy L Thompson } 1994b3e95d5SJeremy L Thompson } 2004b3e95d5SJeremy L Thompson } 2014b3e95d5SJeremy L Thompson return CEED_ERROR_SUCCESS; 2024b3e95d5SJeremy L Thompson } 2034b3e95d5SJeremy L Thompson 2044b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 2054b3e95d5SJeremy L Thompson // Setup fields 2064b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 207*0183ed61SJeremy L Thompson static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i, 208*0183ed61SJeremy L Thompson CeedOperatorField op_field, CeedQFunctionField qf_field, FieldReuse_Hip field_reuse, 209*0183ed61SJeremy L Thompson CeedInt max_dim, CeedInt Q, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points, 210*0183ed61SJeremy L Thompson bool use_3d_slices) { 21174398b5aSJeremy L Thompson bool is_tensor = true; 21274398b5aSJeremy L Thompson CeedBasis basis; 21374398b5aSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis)); 21474398b5aSJeremy L Thompson if (basis != CEED_BASIS_NONE) CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 21574398b5aSJeremy L Thompson 21659fa3f92SJeremy L Thompson const char *field_name; 2174b3e95d5SJeremy L Thompson std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i); 2189123fb08SJeremy L Thompson std::string P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q"; 2194b3e95d5SJeremy L Thompson std::string option_name = (is_input ? "inputs" : "outputs"); 2204b3e95d5SJeremy L Thompson CeedEvalMode eval_mode = CEED_EVAL_NONE; 22174398b5aSJeremy L Thompson CeedInt elem_size = 0, num_comp = 0, dim = max_dim, P_1d = 0; 2224b3e95d5SJeremy L Thompson CeedElemRestriction elem_rstr; 2234b3e95d5SJeremy L Thompson CeedBasis_Hip_shared *basis_data; 2244b3e95d5SJeremy L Thompson 2259ee499e5SJeremy L Thompson // Field reuse info 22645a787f7SJeremy L Thompson bool use_previous_field = field_reuse.index != -1; 2279ee499e5SJeremy L Thompson 22859fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_field, &field_name)); 229*0183ed61SJeremy L Thompson code << tab << "// -- " << (is_input ? "Input" : "Output") << " field " << i << ": " << field_name << "\n"; 2304b3e95d5SJeremy L Thompson 2314b3e95d5SJeremy L Thompson // Get field data 2324b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr)); 2334b3e95d5SJeremy L Thompson if (elem_rstr != CEED_ELEMRESTRICTION_NONE) { 2344b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 2354b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 2364b3e95d5SJeremy L Thompson } 2373a2968d6SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 2384b3e95d5SJeremy L Thompson if (basis != CEED_BASIS_NONE) { 2394b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &basis_data)); 24074398b5aSJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 2419123fb08SJeremy L Thompson if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 2429123fb08SJeremy L Thompson else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d)); 2434b3e95d5SJeremy L Thompson } 2444b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode)); 2454b3e95d5SJeremy L Thompson 2464b3e95d5SJeremy L Thompson // Set field constants 247*0183ed61SJeremy L Thompson code << tab << "const CeedInt dim" << var_suffix << " = " << dim << ";\n"; 24874398b5aSJeremy L Thompson if (is_tensor && !is_all_tensor) { 24974398b5aSJeremy L Thompson CeedInt P = 0; 25074398b5aSJeremy L Thompson 25174398b5aSJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); 252*0183ed61SJeremy L Thompson code << tab << "const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q : P) << ";\n"; 25374398b5aSJeremy L Thompson } 254*0183ed61SJeremy L Thompson code << tab << "const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n"; 255343e3094SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) { 256*0183ed61SJeremy L Thompson code << tab << "const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n"; 2574b3e95d5SJeremy L Thompson } 2584b3e95d5SJeremy L Thompson 2594b3e95d5SJeremy L Thompson // Load basis data 260*0183ed61SJeremy L Thompson code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 2614b3e95d5SJeremy L Thompson switch (eval_mode) { 2624b3e95d5SJeremy L Thompson case CEED_EVAL_NONE: 2634b3e95d5SJeremy L Thompson break; 2644b3e95d5SJeremy L Thompson case CEED_EVAL_INTERP: 2653a2968d6SJeremy L Thompson if (is_at_points) { 2663a2968d6SJeremy L Thompson // AtPoints 2673a2968d6SJeremy L Thompson if (!basis_data->d_chebyshev_interp_1d) { 2683a2968d6SJeremy L Thompson CeedSize interp_bytes; 2693a2968d6SJeremy L Thompson CeedScalar *chebyshev_interp_1d; 2703a2968d6SJeremy L Thompson 2713a2968d6SJeremy L Thompson interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 2723a2968d6SJeremy L Thompson CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 2733a2968d6SJeremy L Thompson CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 2743a2968d6SJeremy L Thompson CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes)); 2753a2968d6SJeremy L Thompson CeedCallHip(CeedBasisReturnCeed(basis), 2763a2968d6SJeremy L Thompson hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice)); 2773a2968d6SJeremy L Thompson CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 2783a2968d6SJeremy L Thompson } 2793a2968d6SJeremy L Thompson if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d; 2803a2968d6SJeremy L Thompson else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d; 2813a2968d6SJeremy L Thompson } else { 2823a2968d6SJeremy L Thompson // Standard quadrature 2834b3e95d5SJeremy L Thompson if (is_input) data->B.inputs[i] = basis_data->d_interp_1d; 2844b3e95d5SJeremy L Thompson else data->B.outputs[i] = basis_data->d_interp_1d; 2853a2968d6SJeremy L Thompson } 2869ee499e5SJeremy L Thompson if (use_previous_field) { 28745a787f7SJeremy L Thompson std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index)); 2889ee499e5SJeremy L Thompson 289*0183ed61SJeremy L Thompson code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n"; 2909ee499e5SJeremy L Thompson } else { 291*0183ed61SJeremy L Thompson code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n"; 292*0183ed61SJeremy L Thompson code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n"; 2939ee499e5SJeremy L Thompson } 2944b3e95d5SJeremy L Thompson break; 2954b3e95d5SJeremy L Thompson case CEED_EVAL_GRAD: 2963a2968d6SJeremy L Thompson if (is_at_points) { 2973a2968d6SJeremy L Thompson // AtPoints 2983a2968d6SJeremy L Thompson if (!basis_data->d_chebyshev_interp_1d) { 2993a2968d6SJeremy L Thompson CeedSize interp_bytes; 3003a2968d6SJeremy L Thompson CeedScalar *chebyshev_interp_1d; 3013a2968d6SJeremy L Thompson 3023a2968d6SJeremy L Thompson interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 3033a2968d6SJeremy L Thompson CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 3043a2968d6SJeremy L Thompson CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 3053a2968d6SJeremy L Thompson CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes)); 3063a2968d6SJeremy L Thompson CeedCallHip(CeedBasisReturnCeed(basis), 3073a2968d6SJeremy L Thompson hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice)); 3083a2968d6SJeremy L Thompson CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 3093a2968d6SJeremy L Thompson } 3103a2968d6SJeremy L Thompson if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d; 3113a2968d6SJeremy L Thompson else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d; 3123a2968d6SJeremy L Thompson } else { 3133a2968d6SJeremy L Thompson // Standard quadrature 3144b3e95d5SJeremy L Thompson if (is_input) data->B.inputs[i] = basis_data->d_interp_1d; 3154b3e95d5SJeremy L Thompson else data->B.outputs[i] = basis_data->d_interp_1d; 3163a2968d6SJeremy L Thompson } 3179123fb08SJeremy L Thompson if (is_tensor) { 3189ee499e5SJeremy L Thompson if (use_previous_field) { 31945a787f7SJeremy L Thompson std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index)); 3209ee499e5SJeremy L Thompson 321*0183ed61SJeremy L Thompson code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n"; 3229ee499e5SJeremy L Thompson } else { 323*0183ed61SJeremy L Thompson code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n"; 324*0183ed61SJeremy L Thompson code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n"; 3259123fb08SJeremy L Thompson } 3269ee499e5SJeremy L Thompson } 3273a2968d6SJeremy L Thompson if (is_at_points) break; // No G mat for AtPoints 3284b3e95d5SJeremy L Thompson if (use_3d_slices) { 3294b3e95d5SJeremy L Thompson if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d; 3304b3e95d5SJeremy L Thompson else data->G.outputs[i] = basis_data->d_collo_grad_1d; 33145a787f7SJeremy L Thompson if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) { 33245a787f7SJeremy L Thompson std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index)); 3339ee499e5SJeremy L Thompson 334*0183ed61SJeremy L Thompson code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n"; 3359ee499e5SJeremy L Thompson } else { 336*0183ed61SJeremy L Thompson code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n"; 337*0183ed61SJeremy L Thompson code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; 3389ee499e5SJeremy L Thompson } 3394b3e95d5SJeremy L Thompson } else { 3404b3e95d5SJeremy L Thompson bool has_collo_grad = basis_data->d_collo_grad_1d; 3414b3e95d5SJeremy L Thompson 3424b3e95d5SJeremy L Thompson if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; 3434b3e95d5SJeremy L Thompson else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; 3444b3e95d5SJeremy L Thompson if (has_collo_grad) { 34545a787f7SJeremy L Thompson if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) { 34645a787f7SJeremy L Thompson std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index)); 3479ee499e5SJeremy L Thompson 348*0183ed61SJeremy L Thompson code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n"; 3499ee499e5SJeremy L Thompson } else { 350*0183ed61SJeremy L Thompson code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n"; 351*0183ed61SJeremy L Thompson code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; 3529ee499e5SJeremy L Thompson } 3539ee499e5SJeremy L Thompson } else { 35445a787f7SJeremy L Thompson if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) { 35545a787f7SJeremy L Thompson std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index)); 3569ee499e5SJeremy L Thompson 357*0183ed61SJeremy L Thompson code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n"; 3584b3e95d5SJeremy L Thompson } else { 359*0183ed61SJeremy L Thompson code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim") 36074398b5aSJeremy L Thompson << (is_tensor ? "" : var_suffix) << "];\n"; 361*0183ed61SJeremy L Thompson code << tab << "LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << (is_tensor ? "" : var_suffix) << ">(data, G." 36274398b5aSJeremy L Thompson << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; 3634b3e95d5SJeremy L Thompson } 3644b3e95d5SJeremy L Thompson } 3659ee499e5SJeremy L Thompson } 3664b3e95d5SJeremy L Thompson break; 3674b3e95d5SJeremy L Thompson case CEED_EVAL_WEIGHT: 3684b3e95d5SJeremy L Thompson break; // No action 3694b3e95d5SJeremy L Thompson // LCOV_EXCL_START 3704b3e95d5SJeremy L Thompson case CEED_EVAL_DIV: 3714b3e95d5SJeremy L Thompson case CEED_EVAL_CURL: 3724b3e95d5SJeremy L Thompson break; // TODO: Not implemented 3734b3e95d5SJeremy L Thompson // LCOV_EXCL_STOP 3744b3e95d5SJeremy L Thompson } 3753a2968d6SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 3764b3e95d5SJeremy L Thompson return CEED_ERROR_SUCCESS; 3774b3e95d5SJeremy L Thompson } 3784b3e95d5SJeremy L Thompson 3794b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 3804b3e95d5SJeremy L Thompson // Restriction 3814b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 382*0183ed61SJeremy L Thompson static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i, 383*0183ed61SJeremy L Thompson CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field, 384*0183ed61SJeremy L Thompson CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points, 385*0183ed61SJeremy L Thompson bool use_3d_slices) { 3864b3e95d5SJeremy L Thompson std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i); 38774398b5aSJeremy L Thompson std::string P_name = (is_all_tensor ? "P_1d" : "P") + var_suffix; 3884b3e95d5SJeremy L Thompson CeedEvalMode eval_mode = CEED_EVAL_NONE; 38974398b5aSJeremy L Thompson CeedInt elem_size = 0, num_comp = 0; 3904b3e95d5SJeremy L Thompson CeedSize l_size; 391f815fac9SJeremy L Thompson CeedRestrictionType rstr_type = CEED_RESTRICTION_STANDARD; 3924b3e95d5SJeremy L Thompson CeedElemRestriction_Hip *rstr_data; 3934b3e95d5SJeremy L Thompson CeedElemRestriction elem_rstr; 3944b3e95d5SJeremy L Thompson 3954b3e95d5SJeremy L Thompson // Get field data 3964b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr)); 3974b3e95d5SJeremy L Thompson if (elem_rstr != CEED_ELEMRESTRICTION_NONE) { 398f815fac9SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type)); 3994b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 4004b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 4014b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); 4024b3e95d5SJeremy L Thompson } 4034b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode)); 4044b3e95d5SJeremy L Thompson 4054b3e95d5SJeremy L Thompson // Restriction 4064b3e95d5SJeremy L Thompson if (is_input) { 4074b3e95d5SJeremy L Thompson // Input 408e93651e5SJeremy L Thompson if (field_input_buffer[i] != i) { 409e93651e5SJeremy L Thompson std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]); 410e93651e5SJeremy L Thompson 411e93651e5SJeremy L Thompson // Restriction was already done for previous input 412*0183ed61SJeremy L Thompson code << tab << "CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n"; 4133a2968d6SJeremy L Thompson } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) { 4143a2968d6SJeremy L Thompson if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) { 415e93651e5SJeremy L Thompson // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated 416*0183ed61SJeremy L Thompson code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n"; 4173a2968d6SJeremy L Thompson } else if (rstr_type != CEED_RESTRICTION_POINTS) { 418e93651e5SJeremy L Thompson // Otherwise we're using the scratch space 419*0183ed61SJeremy L Thompson code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n"; 420e93651e5SJeremy L Thompson } 421f815fac9SJeremy L Thompson switch (rstr_type) { 422f815fac9SJeremy L Thompson case CEED_RESTRICTION_STANDARD: { 4234b3e95d5SJeremy L Thompson CeedInt comp_stride; 4244b3e95d5SJeremy L Thompson 4254b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 426*0183ed61SJeremy L Thompson code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; 4274b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 428*0183ed61SJeremy L Thompson code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 4294b3e95d5SJeremy L Thompson data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; 430*0183ed61SJeremy L Thompson code << tab << "ReadLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", " 431*0183ed61SJeremy L Thompson << P_name << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix 432*0183ed61SJeremy L Thompson << ");\n"; 433f815fac9SJeremy L Thompson break; 434f815fac9SJeremy L Thompson } 435f815fac9SJeremy L Thompson case CEED_RESTRICTION_STRIDED: { 4364b3e95d5SJeremy L Thompson bool has_backend_strides; 4374b3e95d5SJeremy L Thompson CeedInt num_elem; 4384b3e95d5SJeremy L Thompson 4394b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); 4404b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); 4414b3e95d5SJeremy L Thompson CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 4424b3e95d5SJeremy L Thompson 4434b3e95d5SJeremy L Thompson if (!has_backend_strides) { 4444b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); 4454b3e95d5SJeremy L Thompson } 446*0183ed61SJeremy L Thompson code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1] 447*0183ed61SJeremy L Thompson << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n"; 448*0183ed61SJeremy L Thompson code << tab << "ReadLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides" 449*0183ed61SJeremy L Thompson << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, d" << var_suffix << ", r_e" 450*0183ed61SJeremy L Thompson << var_suffix << ");\n"; 451f815fac9SJeremy L Thompson break; 452f815fac9SJeremy L Thompson } 4533a2968d6SJeremy L Thompson case CEED_RESTRICTION_POINTS: { 4543a2968d6SJeremy L Thompson CeedInt comp_stride; 4553a2968d6SJeremy L Thompson 4563a2968d6SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 457*0183ed61SJeremy L Thompson code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 4583a2968d6SJeremy L Thompson data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; 4593a2968d6SJeremy L Thompson break; 4603a2968d6SJeremy L Thompson } 461f815fac9SJeremy L Thompson // LCOV_EXCL_START 462f815fac9SJeremy L Thompson case CEED_RESTRICTION_ORIENTED: 463f815fac9SJeremy L Thompson case CEED_RESTRICTION_CURL_ORIENTED: 464f815fac9SJeremy L Thompson break; // TODO: Not implemented 465f815fac9SJeremy L Thompson // LCOV_EXCL_STOP 4664b3e95d5SJeremy L Thompson } 4674b3e95d5SJeremy L Thompson } 4684b3e95d5SJeremy L Thompson } else { 4694b3e95d5SJeremy L Thompson // Output 470f815fac9SJeremy L Thompson switch (rstr_type) { 471f815fac9SJeremy L Thompson case CEED_RESTRICTION_STANDARD: { 4724b3e95d5SJeremy L Thompson CeedInt comp_stride; 4734b3e95d5SJeremy L Thompson 4744b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 475*0183ed61SJeremy L Thompson code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; 4764b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 477*0183ed61SJeremy L Thompson code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 4784b3e95d5SJeremy L Thompson data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets; 479*0183ed61SJeremy L Thompson code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", " 480*0183ed61SJeremy L Thompson << P_name << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix 481*0183ed61SJeremy L Thompson << ");\n"; 482f815fac9SJeremy L Thompson break; 483f815fac9SJeremy L Thompson } 484f815fac9SJeremy L Thompson case CEED_RESTRICTION_STRIDED: { 4854b3e95d5SJeremy L Thompson bool has_backend_strides; 4864b3e95d5SJeremy L Thompson CeedInt num_elem; 4874b3e95d5SJeremy L Thompson 4884b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); 4894b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); 4904b3e95d5SJeremy L Thompson CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 4914b3e95d5SJeremy L Thompson 4924b3e95d5SJeremy L Thompson if (!has_backend_strides) { 4934b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); 4944b3e95d5SJeremy L Thompson } 495*0183ed61SJeremy L Thompson code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1] 496*0183ed61SJeremy L Thompson << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n"; 497*0183ed61SJeremy L Thompson code << tab << "WriteLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides" 498*0183ed61SJeremy L Thompson << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, r_e" << var_suffix << ", d" << var_suffix 499*0183ed61SJeremy L Thompson << ");\n"; 500f815fac9SJeremy L Thompson break; 501f815fac9SJeremy L Thompson } 5023a2968d6SJeremy L Thompson case CEED_RESTRICTION_POINTS: 5033a2968d6SJeremy L Thompson data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets; 5043a2968d6SJeremy L Thompson break; 505f815fac9SJeremy L Thompson // LCOV_EXCL_START 506f815fac9SJeremy L Thompson case CEED_RESTRICTION_ORIENTED: 507f815fac9SJeremy L Thompson case CEED_RESTRICTION_CURL_ORIENTED: 508f815fac9SJeremy L Thompson break; // TODO: Not implemented 509f815fac9SJeremy L Thompson // LCOV_EXCL_STOP 5104b3e95d5SJeremy L Thompson } 5114b3e95d5SJeremy L Thompson } 5123a2968d6SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 5134b3e95d5SJeremy L Thompson return CEED_ERROR_SUCCESS; 5144b3e95d5SJeremy L Thompson } 5154b3e95d5SJeremy L Thompson 5164b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 5174b3e95d5SJeremy L Thompson // Basis 5184b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 519*0183ed61SJeremy L Thompson static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i, CeedOperatorField op_field, 52074398b5aSJeremy L Thompson CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor, 5213a2968d6SJeremy L Thompson bool is_at_points, bool use_3d_slices) { 52274398b5aSJeremy L Thompson bool is_tensor = true; 52374398b5aSJeremy L Thompson CeedBasis basis; 52474398b5aSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis)); 52574398b5aSJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 52674398b5aSJeremy L Thompson 5274b3e95d5SJeremy L Thompson std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i); 5289123fb08SJeremy L Thompson std::string P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q"; 5294b3e95d5SJeremy L Thompson CeedEvalMode eval_mode = CEED_EVAL_NONE; 53074398b5aSJeremy L Thompson CeedInt dim = max_dim, elem_size = 0, num_comp = 0, P_1d = 0; 5314b3e95d5SJeremy L Thompson CeedElemRestriction elem_rstr; 5324b3e95d5SJeremy L Thompson 5334b3e95d5SJeremy L Thompson // Get field data 5344b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr)); 5354b3e95d5SJeremy L Thompson if (elem_rstr != CEED_ELEMRESTRICTION_NONE) { 5364b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 5374b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 5384b3e95d5SJeremy L Thompson } 5393a2968d6SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 5404b3e95d5SJeremy L Thompson if (basis != CEED_BASIS_NONE) { 54174398b5aSJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 5429123fb08SJeremy L Thompson if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 5439123fb08SJeremy L Thompson else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d)); 5444b3e95d5SJeremy L Thompson } 5454b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode)); 5464b3e95d5SJeremy L Thompson 5474b3e95d5SJeremy L Thompson // Basis 548*0183ed61SJeremy L Thompson code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 5494b3e95d5SJeremy L Thompson if (is_input) { 5504b3e95d5SJeremy L Thompson switch (eval_mode) { 5514b3e95d5SJeremy L Thompson case CEED_EVAL_NONE: 5523a2968d6SJeremy L Thompson if (!use_3d_slices && !is_at_points) { 553*0183ed61SJeremy L Thompson code << tab << "CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n"; 5544b3e95d5SJeremy L Thompson } 5554b3e95d5SJeremy L Thompson break; 5564b3e95d5SJeremy L Thompson case CEED_EVAL_INTERP: 5573a2968d6SJeremy L Thompson if (is_at_points) { 5589123fb08SJeremy L Thompson std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d"; 5599123fb08SJeremy L Thompson 560*0183ed61SJeremy L Thompson code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n"; 561*0183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix 5626b92dc4bSJeremy L Thompson << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n"; 5633a2968d6SJeremy L Thompson } else { 56474398b5aSJeremy L Thompson std::string function_name = is_tensor 56574398b5aSJeremy L Thompson ? ((dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened")) 56674398b5aSJeremy L Thompson : "InterpNonTensor"; 56774398b5aSJeremy L Thompson std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name); 5689123fb08SJeremy L Thompson 569*0183ed61SJeremy L Thompson code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n"; 570*0183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e" 57174398b5aSJeremy L Thompson << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n"; 5723a2968d6SJeremy L Thompson } 5734b3e95d5SJeremy L Thompson break; 5744b3e95d5SJeremy L Thompson case CEED_EVAL_GRAD: 5753a2968d6SJeremy L Thompson if (is_at_points) { 5769123fb08SJeremy L Thompson std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d"; 5779123fb08SJeremy L Thompson 578*0183ed61SJeremy L Thompson code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n"; 579*0183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix 5806b92dc4bSJeremy L Thompson << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n"; 5813a2968d6SJeremy L Thompson } else if (use_3d_slices) { 5829123fb08SJeremy L Thompson std::string function_name = (dim > 1 ? "InterpTensor" : "Interp") + std::to_string(dim) + "d"; 5839123fb08SJeremy L Thompson 584*0183ed61SJeremy L Thompson code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n"; 585*0183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix 5866b92dc4bSJeremy L Thompson << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n"; 5879123fb08SJeremy L Thompson } else if (is_tensor) { 5889123fb08SJeremy L Thompson bool is_collocated = dim == 3 && Q_1d >= P_1d; 58974398b5aSJeremy L Thompson std::string function_name = (dim == 1 ? "Grad" : (is_collocated ? "GradTensorCollocated" : "GradTensor")) + std::to_string(dim) + "d" + 59074398b5aSJeremy L Thompson (is_all_tensor ? "" : "Flattened"); 59174398b5aSJeremy L Thompson std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name); 5929123fb08SJeremy L Thompson 593*0183ed61SJeremy L Thompson code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*" 59474398b5aSJeremy L Thompson << (is_all_tensor && dim >= 3 ? Q_name : "1") << "];\n"; 595*0183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e" 59674398b5aSJeremy L Thompson << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n"; 5974b3e95d5SJeremy L Thompson } else { 5989123fb08SJeremy L Thompson std::string function_name = "GradNonTensor"; 5999123fb08SJeremy L Thompson 600*0183ed61SJeremy L Thompson code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n"; 601*0183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name 60274398b5aSJeremy L Thompson << ", OP_T_1D>(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n"; 6034b3e95d5SJeremy L Thompson } 6044b3e95d5SJeremy L Thompson break; 6054b3e95d5SJeremy L Thompson case CEED_EVAL_WEIGHT: { 6063a2968d6SJeremy L Thompson if (is_at_points) { 607*0183ed61SJeremy L Thompson code << tab << "// Nothing to do AtPoints\n"; 6083a2968d6SJeremy L Thompson } else { 6094b3e95d5SJeremy L Thompson CeedBasis_Hip_shared *basis_data; 61074398b5aSJeremy L Thompson std::string function_name = is_tensor 61174398b5aSJeremy L Thompson ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened")) 61274398b5aSJeremy L Thompson : "WeightNonTensor"; 6134b3e95d5SJeremy L Thompson 614*0183ed61SJeremy L Thompson code << tab << "CeedScalar r_q" << var_suffix << "[" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n"; 6154b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &basis_data)); 6164b3e95d5SJeremy L Thompson data->W = basis_data->d_q_weight_1d; 617*0183ed61SJeremy L Thompson code << tab << function_name << "<" << P_name << ", " << Q_name << ">(data, W, r_q" << var_suffix << ");\n"; 6183a2968d6SJeremy L Thompson } 6194b3e95d5SJeremy L Thompson break; 6204b3e95d5SJeremy L Thompson } 6214b3e95d5SJeremy L Thompson // LCOV_EXCL_START 6224b3e95d5SJeremy L Thompson case CEED_EVAL_DIV: 6234b3e95d5SJeremy L Thompson case CEED_EVAL_CURL: 6244b3e95d5SJeremy L Thompson break; // TODO: Not implemented 6254b3e95d5SJeremy L Thompson // LCOV_EXCL_STOP 6264b3e95d5SJeremy L Thompson } 6274b3e95d5SJeremy L Thompson } else { 6284b3e95d5SJeremy L Thompson switch (eval_mode) { 6294b3e95d5SJeremy L Thompson case CEED_EVAL_NONE: 630*0183ed61SJeremy L Thompson code << tab << "CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n"; 6314b3e95d5SJeremy L Thompson break; // No action 6324b3e95d5SJeremy L Thompson case CEED_EVAL_INTERP: 633*0183ed61SJeremy L Thompson code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n"; 6343a2968d6SJeremy L Thompson if (is_at_points) { 6359123fb08SJeremy L Thompson std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d"; 6369123fb08SJeremy L Thompson 637*0183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix 6386b92dc4bSJeremy L Thompson << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; 6393a2968d6SJeremy L Thompson } else { 6409123fb08SJeremy L Thompson std::string function_name = 64174398b5aSJeremy L Thompson is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened")) 64274398b5aSJeremy L Thompson : "InterpTransposeNonTensor"; 64374398b5aSJeremy L Thompson std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name); 6449123fb08SJeremy L Thompson 645*0183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q" 64674398b5aSJeremy L Thompson << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; 6473a2968d6SJeremy L Thompson } 6484b3e95d5SJeremy L Thompson break; 6494b3e95d5SJeremy L Thompson case CEED_EVAL_GRAD: 650*0183ed61SJeremy L Thompson code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n"; 6513a2968d6SJeremy L Thompson if (is_at_points) { 6529123fb08SJeremy L Thompson std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d"; 6539123fb08SJeremy L Thompson 654*0183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix 6556b92dc4bSJeremy L Thompson << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; 6563a2968d6SJeremy L Thompson } else if (use_3d_slices) { 6579123fb08SJeremy L Thompson std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d"; 6589123fb08SJeremy L Thompson 659*0183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix 6606b92dc4bSJeremy L Thompson << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; 6619123fb08SJeremy L Thompson } else if (is_tensor) { 6629123fb08SJeremy L Thompson bool is_collocated = dim == 3 && Q_1d >= P_1d; 66374398b5aSJeremy L Thompson std::string function_name = (dim == 1 ? "GradTranspose" : (is_collocated ? "GradTransposeTensorCollocated" : "GradTransposeTensor")) + 66474398b5aSJeremy L Thompson std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"); 66574398b5aSJeremy L Thompson std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name); 6669123fb08SJeremy L Thompson 667*0183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q" 66874398b5aSJeremy L Thompson << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n"; 6694b3e95d5SJeremy L Thompson } else { 6709123fb08SJeremy L Thompson std::string function_name = "GradTransposeNonTensor"; 6719123fb08SJeremy L Thompson 672*0183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name 67374398b5aSJeremy L Thompson << ", OP_T_1D>(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n"; 6744b3e95d5SJeremy L Thompson } 6754b3e95d5SJeremy L Thompson break; 6764b3e95d5SJeremy L Thompson // LCOV_EXCL_START 6774b3e95d5SJeremy L Thompson case CEED_EVAL_WEIGHT: 6784b3e95d5SJeremy L Thompson break; // Should not occur 6794b3e95d5SJeremy L Thompson case CEED_EVAL_DIV: 6804b3e95d5SJeremy L Thompson case CEED_EVAL_CURL: 6814b3e95d5SJeremy L Thompson break; // TODO: Not implemented 6824b3e95d5SJeremy L Thompson // LCOV_EXCL_STOP 6834b3e95d5SJeremy L Thompson } 6844b3e95d5SJeremy L Thompson } 6853a2968d6SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 6864b3e95d5SJeremy L Thompson return CEED_ERROR_SUCCESS; 6874b3e95d5SJeremy L Thompson } 6884b3e95d5SJeremy L Thompson 6894b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 6904b3e95d5SJeremy L Thompson // QFunction 6914b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 692*0183ed61SJeremy L Thompson static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt max_dim, 693*0183ed61SJeremy L Thompson CeedInt max_num_points, CeedInt num_input_fields, CeedOperatorField *op_input_fields, 694*0183ed61SJeremy L Thompson CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, 695*0183ed61SJeremy L Thompson CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields, 696*0183ed61SJeremy L Thompson std::string qfunction_name, CeedInt Q_1d, bool is_all_tensor, bool is_at_points, 697*0183ed61SJeremy L Thompson bool use_3d_slices) { 69874398b5aSJeremy L Thompson std::string Q_name = is_all_tensor ? "Q_1d" : "Q"; 6994b3e95d5SJeremy L Thompson CeedEvalMode eval_mode = CEED_EVAL_NONE; 7004b3e95d5SJeremy L Thompson CeedElemRestriction elem_rstr; 7014b3e95d5SJeremy L Thompson 7028b97b69aSJeremy L Thompson // Setup output arrays 703*0183ed61SJeremy L Thompson code << "\n"; 704*0183ed61SJeremy L Thompson code << tab << "// -- Output field setup\n"; 7054b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 70659fa3f92SJeremy L Thompson const char *field_name; 7074b3e95d5SJeremy L Thompson std::string var_suffix = "_out_" + std::to_string(i); 7084b3e95d5SJeremy L Thompson 70959fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 710*0183ed61SJeremy L Thompson code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 7114b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 7123a2968d6SJeremy L Thompson switch (eval_mode) { 7133a2968d6SJeremy L Thompson case CEED_EVAL_NONE: 7143a2968d6SJeremy L Thompson if (is_at_points) { 715*0183ed61SJeremy L Thompson code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n"; 7163a2968d6SJeremy L Thompson } else { 717*0183ed61SJeremy L Thompson code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") 71874398b5aSJeremy L Thompson << "];\n"; 7194b3e95d5SJeremy L Thompson } 7203a2968d6SJeremy L Thompson break; 7213a2968d6SJeremy L Thompson case CEED_EVAL_INTERP: 7223a2968d6SJeremy L Thompson if (is_at_points) { 7233a2968d6SJeremy L Thompson // Accumulator for point data 724*0183ed61SJeremy L Thompson code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n"; 725*0183ed61SJeremy L Thompson code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix 726b8245c6cSJeremy L Thompson << "[i] = 0.0;\n"; 7273a2968d6SJeremy L Thompson } else { 728*0183ed61SJeremy L Thompson code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") 72974398b5aSJeremy L Thompson << "];\n"; 7303a2968d6SJeremy L Thompson } 7313a2968d6SJeremy L Thompson break; 7323a2968d6SJeremy L Thompson case CEED_EVAL_GRAD: 7333a2968d6SJeremy L Thompson if (is_at_points) { 7343a2968d6SJeremy L Thompson // Accumulator for point data 735*0183ed61SJeremy L Thompson code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n"; 736*0183ed61SJeremy L Thompson code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix 737b8245c6cSJeremy L Thompson << "[i] = 0.0;\n"; 7383a2968d6SJeremy L Thompson } else if (use_3d_slices) { 7394b3e95d5SJeremy L Thompson // Accumulator for gradient slices 740*0183ed61SJeremy L Thompson code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n"; 741*0183ed61SJeremy L Thompson code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) r_q" << var_suffix << "[i] = 0.0;\n"; 7424b3e95d5SJeremy L Thompson } else { 743*0183ed61SJeremy L Thompson code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*" 74474398b5aSJeremy L Thompson << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") << "];\n"; 7454b3e95d5SJeremy L Thompson } 7463a2968d6SJeremy L Thompson break; 7473a2968d6SJeremy L Thompson case CEED_EVAL_WEIGHT: 7483a2968d6SJeremy L Thompson break; 7493a2968d6SJeremy L Thompson // LCOV_EXCL_START 7503a2968d6SJeremy L Thompson case CEED_EVAL_DIV: 7513a2968d6SJeremy L Thompson case CEED_EVAL_CURL: 7523a2968d6SJeremy L Thompson break; // TODO: Not implemented 7533a2968d6SJeremy L Thompson // LCOV_EXCL_STOP 7544b3e95d5SJeremy L Thompson } 7554b3e95d5SJeremy L Thompson } 7564b3e95d5SJeremy L Thompson 7573a2968d6SJeremy L Thompson if (is_at_points) { 7583a2968d6SJeremy L Thompson // We need to handle batches of points 759*0183ed61SJeremy L Thompson code << "\n"; 760*0183ed61SJeremy L Thompson code << tab << "// Note: Using batches of points\n"; 761*0183ed61SJeremy L Thompson code << tab << "const CeedInt point_loop_bound = (blockDim.x*blockDim.y) * ceil((1.0*max_num_points) / (blockDim.x*blockDim.y));\n\n"; 762*0183ed61SJeremy L Thompson code << tab << "#pragma unroll\n"; 763*0183ed61SJeremy L Thompson code << tab << "for (CeedInt i = threadIdx.x + threadIdx.y*blockDim.x; i < point_loop_bound; i += blockDim.x*blockDim.y) {\n"; 764*0183ed61SJeremy L Thompson tab.push(); 765*0183ed61SJeremy L Thompson code << tab << "const CeedInt p = i % max_num_points;\n\n"; 7663a2968d6SJeremy L Thompson 767*0183ed61SJeremy L Thompson code << tab << "// -- Coordinates\n"; 768*0183ed61SJeremy L Thompson code << tab << "CeedScalar r_x[max_dim];\n"; 769*0183ed61SJeremy L Thompson code << tab << "ReadPoint<max_dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n"; 7703a2968d6SJeremy L Thompson 771*0183ed61SJeremy L Thompson code << tab << "// -- Input fields\n"; 7723a2968d6SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 77359fa3f92SJeremy L Thompson const char *field_name; 7743a2968d6SJeremy L Thompson std::string var_suffix = "_in_" + std::to_string(i); 775f725b54bSJeremy L Thompson std::string P_name = "P_1d" + var_suffix; 7763a2968d6SJeremy L Thompson 77759fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name)); 778*0183ed61SJeremy L Thompson code << tab << "// ---- Input field " << i << ": " << field_name << "\n"; 7793a2968d6SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 7803a2968d6SJeremy L Thompson // Basis action 781*0183ed61SJeremy L Thompson code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 7823a2968d6SJeremy L Thompson switch (eval_mode) { 7833a2968d6SJeremy L Thompson case CEED_EVAL_NONE: 784*0183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 785*0183ed61SJeremy L Thompson code << tab << "ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix 7863a2968d6SJeremy L Thompson << ", max_num_points>(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n"; 7873a2968d6SJeremy L Thompson break; 7883a2968d6SJeremy L Thompson case CEED_EVAL_INTERP: 789*0183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 790*0183ed61SJeremy L Thompson code << tab << "InterpAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name 79174398b5aSJeremy L Thompson << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n"; 7923a2968d6SJeremy L Thompson break; 7933a2968d6SJeremy L Thompson case CEED_EVAL_GRAD: 794*0183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n"; 795*0183ed61SJeremy L Thompson code << tab << "GradAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name 79674398b5aSJeremy L Thompson << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n"; 7973a2968d6SJeremy L Thompson break; 7983a2968d6SJeremy L Thompson case CEED_EVAL_WEIGHT: 799*0183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[1];\n"; 800*0183ed61SJeremy L Thompson code << tab << "r_s" << var_suffix << "[0] = 1.0;\n"; 8013a2968d6SJeremy L Thompson break; 8023a2968d6SJeremy L Thompson // LCOV_EXCL_START 8033a2968d6SJeremy L Thompson case CEED_EVAL_DIV: 8043a2968d6SJeremy L Thompson case CEED_EVAL_CURL: 8053a2968d6SJeremy L Thompson break; // TODO: Not implemented 8063a2968d6SJeremy L Thompson // LCOV_EXCL_STOP 8073a2968d6SJeremy L Thompson } 8083a2968d6SJeremy L Thompson } 809*0183ed61SJeremy L Thompson code << "\n"; 810*0183ed61SJeremy L Thompson code << tab << "// -- Output fields\n"; 8113a2968d6SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 81259fa3f92SJeremy L Thompson const char *field_name; 8133a2968d6SJeremy L Thompson std::string var_suffix = "_out_" + std::to_string(i); 8143a2968d6SJeremy L Thompson 81559fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 816*0183ed61SJeremy L Thompson code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 8173a2968d6SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 8183a2968d6SJeremy L Thompson // Basis action 8193a2968d6SJeremy L Thompson switch (eval_mode) { 8203a2968d6SJeremy L Thompson case CEED_EVAL_NONE: 821*0183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 8223a2968d6SJeremy L Thompson break; 8233a2968d6SJeremy L Thompson case CEED_EVAL_INTERP: 824*0183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 8253a2968d6SJeremy L Thompson break; 8263a2968d6SJeremy L Thompson case CEED_EVAL_GRAD: 827*0183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n"; 8283a2968d6SJeremy L Thompson break; 8293a2968d6SJeremy L Thompson // LCOV_EXCL_START 8303a2968d6SJeremy L Thompson case CEED_EVAL_WEIGHT: 8313a2968d6SJeremy L Thompson break; // Should not occur 8323a2968d6SJeremy L Thompson case CEED_EVAL_DIV: 8333a2968d6SJeremy L Thompson case CEED_EVAL_CURL: 8343a2968d6SJeremy L Thompson break; // TODO: Not implemented 8353a2968d6SJeremy L Thompson // LCOV_EXCL_STOP 8363a2968d6SJeremy L Thompson } 8373a2968d6SJeremy L Thompson } 8383a2968d6SJeremy L Thompson 8393a2968d6SJeremy L Thompson } else if (use_3d_slices) { 8404b3e95d5SJeremy L Thompson // We treat quadrature points per slice in 3d to save registers 841*0183ed61SJeremy L Thompson code << "\n"; 842*0183ed61SJeremy L Thompson code << tab << "// Note: Using planes of 3D elements\n"; 843*0183ed61SJeremy L Thompson code << tab << "#pragma unroll\n"; 844*0183ed61SJeremy L Thompson code << tab << "for (CeedInt q = 0; q < " << Q_name << "; q++) {\n"; 845*0183ed61SJeremy L Thompson tab.push(); 846*0183ed61SJeremy L Thompson code << tab << "// -- Input fields\n"; 8474b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 84859fa3f92SJeremy L Thompson const char *field_name; 8494b3e95d5SJeremy L Thompson std::string var_suffix = "_in_" + std::to_string(i); 8504b3e95d5SJeremy L Thompson 85159fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name)); 852*0183ed61SJeremy L Thompson code << tab << "// ---- Input field " << i << ": " << field_name << "\n"; 8534b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 8544b3e95d5SJeremy L Thompson // Basis action 855*0183ed61SJeremy L Thompson code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 8564b3e95d5SJeremy L Thompson switch (eval_mode) { 8574b3e95d5SJeremy L Thompson case CEED_EVAL_NONE: 8584b3e95d5SJeremy L Thompson bool is_strided; 8594b3e95d5SJeremy L Thompson 860*0183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 8614b3e95d5SJeremy L Thompson 8624b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 8634b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); 8644b3e95d5SJeremy L Thompson if (is_strided) { 8654b3e95d5SJeremy L Thompson bool has_backend_strides; 8664b3e95d5SJeremy L Thompson CeedInt num_elem, elem_size; 8674b3e95d5SJeremy L Thompson 8684b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 8694b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); 8704b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); 8714b3e95d5SJeremy L Thompson CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 8724b3e95d5SJeremy L Thompson 8734b3e95d5SJeremy L Thompson if (!has_backend_strides) { 8744b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); 8754b3e95d5SJeremy L Thompson } 876*0183ed61SJeremy L Thompson code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1] 877*0183ed61SJeremy L Thompson << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n"; 878*0183ed61SJeremy L Thompson code << tab << "ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", strides" << var_suffix << "_0, strides" 879*0183ed61SJeremy L Thompson << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n"; 8804b3e95d5SJeremy L Thompson } else { 8814b3e95d5SJeremy L Thompson CeedSize l_size = 0; 8824b3e95d5SJeremy L Thompson CeedInt comp_stride; 8834b3e95d5SJeremy L Thompson CeedElemRestriction_Hip *rstr_data; 8844b3e95d5SJeremy L Thompson 8854b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 886*0183ed61SJeremy L Thompson code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; 8874b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 888*0183ed61SJeremy L Thompson code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 8894b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); 8904b3e95d5SJeremy L Thompson data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; 891*0183ed61SJeremy L Thompson code << tab << "ReadEVecSliceStandard3d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", " << Q_name << ">(data, l_size" 892*0183ed61SJeremy L Thompson << var_suffix << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n"; 8934b3e95d5SJeremy L Thompson } 8949123fb08SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 8954b3e95d5SJeremy L Thompson break; 8964b3e95d5SJeremy L Thompson case CEED_EVAL_INTERP: 897*0183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 898*0183ed61SJeremy L Thompson code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n"; 899*0183ed61SJeremy L Thompson tab.push(); 900*0183ed61SJeremy L Thompson code << tab << "r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n"; 901*0183ed61SJeremy L Thompson tab.pop(); 902*0183ed61SJeremy L Thompson code << tab << "}\n"; 9034b3e95d5SJeremy L Thompson break; 9044b3e95d5SJeremy L Thompson case CEED_EVAL_GRAD: 905*0183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n"; 906*0183ed61SJeremy L Thompson code << tab << "GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_q" << var_suffix << ", s_G" 9076b92dc4bSJeremy L Thompson << var_suffix << ", r_s" << var_suffix << ");\n"; 9084b3e95d5SJeremy L Thompson break; 9094b3e95d5SJeremy L Thompson case CEED_EVAL_WEIGHT: 910*0183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[1];\n"; 911*0183ed61SJeremy L Thompson code << tab << "r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n"; 9123a2968d6SJeremy L Thompson break; 9134b3e95d5SJeremy L Thompson // LCOV_EXCL_START 9144b3e95d5SJeremy L Thompson case CEED_EVAL_DIV: 9154b3e95d5SJeremy L Thompson case CEED_EVAL_CURL: 9164b3e95d5SJeremy L Thompson break; // TODO: Not implemented 9174b3e95d5SJeremy L Thompson // LCOV_EXCL_STOP 9184b3e95d5SJeremy L Thompson } 9194b3e95d5SJeremy L Thompson } 920*0183ed61SJeremy L Thompson code << "\n"; 921*0183ed61SJeremy L Thompson code << tab << "// -- Output fields\n"; 9224b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 92359fa3f92SJeremy L Thompson const char *field_name; 9244b3e95d5SJeremy L Thompson std::string var_suffix = "_out_" + std::to_string(i); 9254b3e95d5SJeremy L Thompson 92659fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 927*0183ed61SJeremy L Thompson code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 9284b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 9294b3e95d5SJeremy L Thompson // Basis action 9304b3e95d5SJeremy L Thompson switch (eval_mode) { 9314b3e95d5SJeremy L Thompson case CEED_EVAL_NONE: 932*0183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 9333a2968d6SJeremy L Thompson break; 9344b3e95d5SJeremy L Thompson case CEED_EVAL_INTERP: 935*0183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 9364b3e95d5SJeremy L Thompson break; 9374b3e95d5SJeremy L Thompson case CEED_EVAL_GRAD: 938*0183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n"; 9394b3e95d5SJeremy L Thompson break; 9404b3e95d5SJeremy L Thompson // LCOV_EXCL_START 9414b3e95d5SJeremy L Thompson case CEED_EVAL_WEIGHT: 9424b3e95d5SJeremy L Thompson break; // Should not occur 9434b3e95d5SJeremy L Thompson case CEED_EVAL_DIV: 9444b3e95d5SJeremy L Thompson case CEED_EVAL_CURL: 9454b3e95d5SJeremy L Thompson break; // TODO: Not implemented 9464b3e95d5SJeremy L Thompson // LCOV_EXCL_STOP 9474b3e95d5SJeremy L Thompson } 9484b3e95d5SJeremy L Thompson } 9494b3e95d5SJeremy L Thompson } else { 950*0183ed61SJeremy L Thompson code << "\n"; 951*0183ed61SJeremy L Thompson code << tab << "// Note: Using full elements\n"; 952*0183ed61SJeremy L Thompson code << tab << "{\n"; 953*0183ed61SJeremy L Thompson tab.push(); 954*0183ed61SJeremy L Thompson code << tab << "// -- Input fields\n"; 9554b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 95659fa3f92SJeremy L Thompson const char *field_name; 95759fa3f92SJeremy L Thompson 95859fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name)); 959*0183ed61SJeremy L Thompson code << tab << "// ---- Input field " << i << ": " << field_name << "\n"; 960*0183ed61SJeremy L Thompson code << tab << "CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n"; 9614b3e95d5SJeremy L Thompson } 962*0183ed61SJeremy L Thompson code << tab << "// -- Output fields\n"; 9634b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 96459fa3f92SJeremy L Thompson const char *field_name; 96559fa3f92SJeremy L Thompson 96659fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 967*0183ed61SJeremy L Thompson code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 968*0183ed61SJeremy L Thompson code << tab << "CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n"; 9694b3e95d5SJeremy L Thompson } 9704b3e95d5SJeremy L Thompson } 9714b3e95d5SJeremy L Thompson 9724b3e95d5SJeremy L Thompson // Input and output buffers 973*0183ed61SJeremy L Thompson code << "\n"; 974*0183ed61SJeremy L Thompson code << tab << "// -- QFunction inputs and outputs\n"; 975*0183ed61SJeremy L Thompson code << tab << "// ---- Inputs\n"; 976*0183ed61SJeremy L Thompson code << tab << "CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n"; 9774b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 97859fa3f92SJeremy L Thompson const char *field_name; 97959fa3f92SJeremy L Thompson 98059fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name)); 981*0183ed61SJeremy L Thompson code << tab << "// ------ Input field " << i << ": " << field_name << "\n"; 982*0183ed61SJeremy L Thompson code << tab << "inputs[" << i << "] = r_s_in_" << i << ";\n"; 9834b3e95d5SJeremy L Thompson } 984*0183ed61SJeremy L Thompson code << tab << "// ---- Outputs\n"; 985*0183ed61SJeremy L Thompson code << tab << "CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n"; 9864b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 98759fa3f92SJeremy L Thompson const char *field_name; 98859fa3f92SJeremy L Thompson 98959fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 990*0183ed61SJeremy L Thompson code << tab << "// ------ Output field " << i << ": " << field_name << "\n"; 991*0183ed61SJeremy L Thompson code << tab << "outputs[" << i << "] = r_s_out_" << i << ";\n"; 9924b3e95d5SJeremy L Thompson } 9934b3e95d5SJeremy L Thompson 9944b3e95d5SJeremy L Thompson // Apply QFunction 995*0183ed61SJeremy L Thompson code << "\n"; 996*0183ed61SJeremy L Thompson code << tab << "// -- Apply QFunction\n"; 997*0183ed61SJeremy L Thompson code << tab << "" << qfunction_name << "(ctx, "; 99874398b5aSJeremy L Thompson if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) { 9994b3e95d5SJeremy L Thompson code << "1"; 10004b3e95d5SJeremy L Thompson } else { 10019123fb08SJeremy L Thompson code << Q_name; 10024b3e95d5SJeremy L Thompson } 10034b3e95d5SJeremy L Thompson code << ", inputs, outputs);\n"; 10044b3e95d5SJeremy L Thompson 10053a2968d6SJeremy L Thompson if (is_at_points) { 10063a2968d6SJeremy L Thompson // Map back to coefficients 1007*0183ed61SJeremy L Thompson code << "\n"; 1008*0183ed61SJeremy L Thompson code << tab << "// -- Output fields\n"; 10093a2968d6SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 101059fa3f92SJeremy L Thompson const char *field_name; 10113a2968d6SJeremy L Thompson std::string var_suffix = "_out_" + std::to_string(i); 10123a2968d6SJeremy L Thompson std::string P_name = "P_1d" + var_suffix; 10133a2968d6SJeremy L Thompson 101459fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 1015*0183ed61SJeremy L Thompson code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 10163a2968d6SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 10173a2968d6SJeremy L Thompson // Basis action 1018*0183ed61SJeremy L Thompson code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 10193a2968d6SJeremy L Thompson switch (eval_mode) { 10203a2968d6SJeremy L Thompson case CEED_EVAL_NONE: { 10213a2968d6SJeremy L Thompson CeedInt comp_stride; 10223a2968d6SJeremy L Thompson CeedElemRestriction elem_rstr; 10233a2968d6SJeremy L Thompson 10243a2968d6SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 10253a2968d6SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 10263a2968d6SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 1027*0183ed61SJeremy L Thompson code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 1028*0183ed61SJeremy L Thompson code << tab << "WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix 10293a2968d6SJeremy L Thompson << ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]" 10303a2968d6SJeremy L Thompson << ", r_s" << var_suffix << ", d" << var_suffix << ");\n"; 10313a2968d6SJeremy L Thompson break; 10323a2968d6SJeremy L Thompson } 10333a2968d6SJeremy L Thompson case CEED_EVAL_INTERP: 1034*0183ed61SJeremy L Thompson code << tab << "if (i >= points.num_per_elem[elem]) {\n"; 1035*0183ed61SJeremy L Thompson tab.push(); 1036*0183ed61SJeremy L Thompson code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n"; 1037*0183ed61SJeremy L Thompson tab.pop(); 1038*0183ed61SJeremy L Thompson code << tab << "}\n"; 1039*0183ed61SJeremy L Thompson code << tab << "InterpTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name 1040f725b54bSJeremy L Thompson << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n"; 10413a2968d6SJeremy L Thompson break; 10423a2968d6SJeremy L Thompson case CEED_EVAL_GRAD: 1043*0183ed61SJeremy L Thompson code << tab << "if (i >= points.num_per_elem[elem]) {\n"; 1044*0183ed61SJeremy L Thompson tab.push(); 1045*0183ed61SJeremy L Thompson code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n"; 1046*0183ed61SJeremy L Thompson tab.pop(); 1047*0183ed61SJeremy L Thompson code << tab << "}\n"; 1048*0183ed61SJeremy L Thompson code << tab << "GradTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name 1049f725b54bSJeremy L Thompson << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n"; 10503a2968d6SJeremy L Thompson break; 10513a2968d6SJeremy L Thompson // LCOV_EXCL_START 10523a2968d6SJeremy L Thompson case CEED_EVAL_WEIGHT: 10533a2968d6SJeremy L Thompson break; // Should not occur 10543a2968d6SJeremy L Thompson case CEED_EVAL_DIV: 10553a2968d6SJeremy L Thompson case CEED_EVAL_CURL: 10563a2968d6SJeremy L Thompson break; // TODO: Not implemented 10573a2968d6SJeremy L Thompson // LCOV_EXCL_STOP 10583a2968d6SJeremy L Thompson } 10593a2968d6SJeremy L Thompson } 10603a2968d6SJeremy L Thompson } else if (use_3d_slices) { 10614b3e95d5SJeremy L Thompson // Copy or apply transpose grad, if needed 1062*0183ed61SJeremy L Thompson code << "\n"; 1063*0183ed61SJeremy L Thompson code << tab << "// -- Output fields\n"; 10644b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 106559fa3f92SJeremy L Thompson const char *field_name; 10664b3e95d5SJeremy L Thompson std::string var_suffix = "_out_" + std::to_string(i); 10674b3e95d5SJeremy L Thompson std::string P_name = "P_1d" + var_suffix; 10684b3e95d5SJeremy L Thompson 106959fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 1070*0183ed61SJeremy L Thompson code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 10714b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 10724b3e95d5SJeremy L Thompson // Basis action 1073*0183ed61SJeremy L Thompson code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 10744b3e95d5SJeremy L Thompson switch (eval_mode) { 10754b3e95d5SJeremy L Thompson case CEED_EVAL_NONE: 1076*0183ed61SJeremy L Thompson code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n"; 1077*0183ed61SJeremy L Thompson tab.push(); 1078*0183ed61SJeremy L Thompson code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n"; 1079*0183ed61SJeremy L Thompson tab.pop(); 1080*0183ed61SJeremy L Thompson code << tab << "}\n"; 10813a2968d6SJeremy L Thompson break; 10824b3e95d5SJeremy L Thompson case CEED_EVAL_INTERP: 1083*0183ed61SJeremy L Thompson code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n"; 1084*0183ed61SJeremy L Thompson tab.push(); 1085*0183ed61SJeremy L Thompson code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n"; 1086*0183ed61SJeremy L Thompson tab.pop(); 1087*0183ed61SJeremy L Thompson code << tab << "}\n"; 10884b3e95d5SJeremy L Thompson break; 10894b3e95d5SJeremy L Thompson case CEED_EVAL_GRAD: 1090*0183ed61SJeremy L Thompson code << tab << "GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_s" << var_suffix << ", s_G" 1091f815fac9SJeremy L Thompson << var_suffix << ", r_q" << var_suffix << ");\n"; 10924b3e95d5SJeremy L Thompson break; 10934b3e95d5SJeremy L Thompson // LCOV_EXCL_START 10944b3e95d5SJeremy L Thompson case CEED_EVAL_WEIGHT: 10954b3e95d5SJeremy L Thompson break; // Should not occur 10964b3e95d5SJeremy L Thompson case CEED_EVAL_DIV: 10974b3e95d5SJeremy L Thompson case CEED_EVAL_CURL: 10984b3e95d5SJeremy L Thompson break; // TODO: Not implemented 10994b3e95d5SJeremy L Thompson // LCOV_EXCL_STOP 11004b3e95d5SJeremy L Thompson } 11014b3e95d5SJeremy L Thompson } 11024b3e95d5SJeremy L Thompson } 1103*0183ed61SJeremy L Thompson tab.pop(); 1104*0183ed61SJeremy L Thompson code << tab << "}\n"; 11054b3e95d5SJeremy L Thompson return CEED_ERROR_SUCCESS; 11064b3e95d5SJeremy L Thompson } 11074b3e95d5SJeremy L Thompson 11084b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 11099e201c85SYohann // Build single operator kernel 11107d8d0e25Snbeams //------------------------------------------------------------------------------ 11118d12f40eSJeremy L Thompson extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_build) { 111274398b5aSJeremy L Thompson bool is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false; 11137d8d0e25Snbeams Ceed ceed; 1114efa41df3SJeremy L Thompson CeedInt Q = 0, Q_1d = 0, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0; 1115b7453713SJeremy L Thompson CeedQFunctionField *qf_input_fields, *qf_output_fields; 1116b7453713SJeremy L Thompson CeedQFunction_Hip_gen *qf_data; 1117b7453713SJeremy L Thompson CeedQFunction qf; 1118b7453713SJeremy L Thompson CeedOperatorField *op_input_fields, *op_output_fields; 1119b7453713SJeremy L Thompson CeedOperator_Hip_gen *data; 11204b3e95d5SJeremy L Thompson std::ostringstream code; 1121*0183ed61SJeremy L Thompson Tab tab; 11224b3e95d5SJeremy L Thompson 11238d12f40eSJeremy L Thompson CeedCallBackend(CeedOperatorGetData(op, &data)); 11244b3e95d5SJeremy L Thompson { 11254b3e95d5SJeremy L Thompson bool is_setup_done; 1126b7453713SJeremy L Thompson 1127b7453713SJeremy L Thompson CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); 11288d12f40eSJeremy L Thompson if (is_setup_done) { 11298d12f40eSJeremy L Thompson *is_good_build = !data->use_fallback; 11308d12f40eSJeremy L Thompson return CEED_ERROR_SUCCESS; 11318d12f40eSJeremy L Thompson } 11324b3e95d5SJeremy L Thompson } 1133b7453713SJeremy L Thompson 11348d12f40eSJeremy L Thompson // Check field compatibility 11358d12f40eSJeremy L Thompson CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 11368d12f40eSJeremy L Thompson { 113774398b5aSJeremy L Thompson bool has_shared_bases = true; 11388d12f40eSJeremy L Thompson 11398d12f40eSJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 11408d12f40eSJeremy L Thompson CeedBasis basis; 11418d12f40eSJeremy L Thompson 11428d12f40eSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 11438d12f40eSJeremy L Thompson if (basis != CEED_BASIS_NONE) { 11448d12f40eSJeremy L Thompson bool is_tensor = true; 11458d12f40eSJeremy L Thompson const char *resource; 11468d12f40eSJeremy L Thompson char *resource_root; 11478d12f40eSJeremy L Thompson Ceed basis_ceed; 11488d12f40eSJeremy L Thompson 11498d12f40eSJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 1150c9192acaSJeremy L Thompson is_all_tensor = is_all_tensor && is_tensor; 1151c9192acaSJeremy L Thompson is_all_nontensor = is_all_nontensor && !is_tensor; 11528d12f40eSJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); 11538d12f40eSJeremy L Thompson CeedCallBackend(CeedGetResource(basis_ceed, &resource)); 11548d12f40eSJeremy L Thompson CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); 1155c9192acaSJeremy L Thompson has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared"); 11568d12f40eSJeremy L Thompson CeedCallBackend(CeedFree(&resource_root)); 11578d12f40eSJeremy L Thompson CeedCallBackend(CeedDestroy(&basis_ceed)); 11588d12f40eSJeremy L Thompson } 11598d12f40eSJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 11608d12f40eSJeremy L Thompson } 11618d12f40eSJeremy L Thompson 11628d12f40eSJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 11638d12f40eSJeremy L Thompson CeedBasis basis; 11648d12f40eSJeremy L Thompson 11658d12f40eSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 11668d12f40eSJeremy L Thompson if (basis != CEED_BASIS_NONE) { 11678d12f40eSJeremy L Thompson bool is_tensor = true; 11688d12f40eSJeremy L Thompson const char *resource; 11698d12f40eSJeremy L Thompson char *resource_root; 11708d12f40eSJeremy L Thompson Ceed basis_ceed; 11718d12f40eSJeremy L Thompson 11728d12f40eSJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 1173c9192acaSJeremy L Thompson is_all_tensor = is_all_tensor && is_tensor; 1174c9192acaSJeremy L Thompson is_all_nontensor = is_all_nontensor && !is_tensor; 11758d12f40eSJeremy L Thompson 11768d12f40eSJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); 11778d12f40eSJeremy L Thompson CeedCallBackend(CeedGetResource(basis_ceed, &resource)); 11788d12f40eSJeremy L Thompson CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); 1179c9192acaSJeremy L Thompson has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared"); 11808d12f40eSJeremy L Thompson CeedCallBackend(CeedFree(&resource_root)); 11818d12f40eSJeremy L Thompson CeedCallBackend(CeedDestroy(&basis_ceed)); 11828d12f40eSJeremy L Thompson } 11838d12f40eSJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 11848d12f40eSJeremy L Thompson } 11858d12f40eSJeremy L Thompson // -- Fallback to ref if not all bases are shared 118674398b5aSJeremy L Thompson if (!has_shared_bases) { 11878d12f40eSJeremy L Thompson *is_good_build = false; 11888d12f40eSJeremy L Thompson return CEED_ERROR_SUCCESS; 11898d12f40eSJeremy L Thompson } 11908d12f40eSJeremy L Thompson } 1191b7453713SJeremy L Thompson CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 1192b7453713SJeremy L Thompson CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 1193b7453713SJeremy L Thompson CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); 1194b7453713SJeremy L Thompson CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 11957d8d0e25Snbeams 11964b3e95d5SJeremy L Thompson // Get operator data 11973a2968d6SJeremy L Thompson CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points)); 119874398b5aSJeremy L Thompson { 1199efa41df3SJeremy L Thompson CeedInt max_P = 0, max_P_1d = 0; 120074398b5aSJeremy L Thompson 12014b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelData_Hip_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields, 120274398b5aSJeremy L Thompson qf_output_fields, &max_P, &max_P_1d, &Q, &Q_1d, &max_dim, &is_all_tensor, &use_3d_slices)); 120374398b5aSJeremy L Thompson data->max_P_1d = is_all_tensor ? max_P_1d : max_P; 120474398b5aSJeremy L Thompson } 120574398b5aSJeremy L Thompson if (max_dim == 0) max_dim = 1; 120674398b5aSJeremy L Thompson data->dim = max_dim; 12073a2968d6SJeremy L Thompson if (is_at_points) { 12083a2968d6SJeremy L Thompson CeedElemRestriction_Hip *rstr_data; 12093a2968d6SJeremy L Thompson CeedElemRestriction rstr_points = NULL; 12104b3e95d5SJeremy L Thompson 12113a2968d6SJeremy L Thompson CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); 12123a2968d6SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points)); 12133a2968d6SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride)); 12143a2968d6SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data)); 12153a2968d6SJeremy L Thompson data->points.indices = (CeedInt *)rstr_data->d_offsets; 12163a2968d6SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); 12173a2968d6SJeremy L Thompson } 12183a2968d6SJeremy L Thompson if (is_at_points) use_3d_slices = false; 12193a2968d6SJeremy L Thompson if (Q_1d == 0) { 12203a2968d6SJeremy L Thompson if (is_at_points) Q_1d = max_num_points; 12213a2968d6SJeremy L Thompson else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d)); 12224b3e95d5SJeremy L Thompson } 122374398b5aSJeremy L Thompson if (Q == 0) Q = Q_1d; 122474398b5aSJeremy L Thompson data->Q = Q; 12254b3e95d5SJeremy L Thompson data->Q_1d = Q_1d; 12264b3e95d5SJeremy L Thompson 12270b454692Sjeremylt // Check for restriction only identity operator 12284b3e95d5SJeremy L Thompson { 12294b3e95d5SJeremy L Thompson bool is_identity_qf; 12304b3e95d5SJeremy L Thompson 12312b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf)); 12320b454692Sjeremylt if (is_identity_qf) { 12339e201c85SYohann CeedEvalMode eval_mode_in, eval_mode_out; 1234b7453713SJeremy L Thompson 12352b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in)); 12362b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out)); 12376574a04fSJeremy L Thompson CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND, 12386574a04fSJeremy L Thompson "Backend does not implement restriction only identity operators"); 12390b454692Sjeremylt } 12404b3e95d5SJeremy L Thompson } 1241b2165e7aSSebastian Grimberg 1242b2165e7aSSebastian Grimberg // Load basis source files 1243eaf9ad10SZach Atkins if (!is_all_nontensor) { 1244*0183ed61SJeremy L Thompson code << tab << "// Tensor basis source\n"; 1245*0183ed61SJeremy L Thompson code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n"; 124674398b5aSJeremy L Thompson } 124774398b5aSJeremy L Thompson if (!is_all_tensor) { 1248*0183ed61SJeremy L Thompson code << tab << "// Non-tensor basis source\n"; 1249*0183ed61SJeremy L Thompson code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n"; 12509123fb08SJeremy L Thompson } 12519123fb08SJeremy L Thompson if (is_at_points) { 1252*0183ed61SJeremy L Thompson code << tab << "// AtPoints basis source\n"; 1253*0183ed61SJeremy L Thompson code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n"; 12549123fb08SJeremy L Thompson } 125574398b5aSJeremy L Thompson if (!is_all_tensor && !is_all_nontensor) { 1256*0183ed61SJeremy L Thompson code << tab << "// Tensor basis source\n"; 1257*0183ed61SJeremy L Thompson code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h>\n\n"; 125874398b5aSJeremy L Thompson } 1259*0183ed61SJeremy L Thompson code << tab << "// CodeGen operator source\n"; 1260*0183ed61SJeremy L Thompson code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n"; 12617d8d0e25Snbeams 12624b3e95d5SJeremy L Thompson // Get QFunction name 12634b3e95d5SJeremy L Thompson std::string qfunction_name(qf_data->qfunction_name); 12644b3e95d5SJeremy L Thompson std::string operator_name; 12654b3e95d5SJeremy L Thompson 126609095acaSJeremy L Thompson operator_name = "CeedKernelHipGenOperator_" + qfunction_name; 12677d8d0e25Snbeams 12689e201c85SYohann // Define CEED_Q_VLA 1269*0183ed61SJeremy L Thompson code << "\n" << tab << "#undef CEED_Q_VLA\n"; 127074398b5aSJeremy L Thompson if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) { 1271*0183ed61SJeremy L Thompson code << tab << "#define CEED_Q_VLA 1\n\n"; 12729e201c85SYohann } else { 1273*0183ed61SJeremy L Thompson code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n"; 12749e201c85SYohann } 12759e201c85SYohann 12764b3e95d5SJeremy L Thompson // Add user QFunction source 12774b3e95d5SJeremy L Thompson { 12789c25dd66SJeremy L Thompson const char *source_path; 12794b3e95d5SJeremy L Thompson 12809c25dd66SJeremy L Thompson CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path)); 12819c25dd66SJeremy L Thompson CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file"); 12829c25dd66SJeremy L Thompson 1283*0183ed61SJeremy L Thompson code << tab << "// User QFunction source\n"; 1284*0183ed61SJeremy L Thompson code << tab << "#include \"" << source_path << "\"\n\n"; 12854b3e95d5SJeremy L Thompson } 12867d8d0e25Snbeams 12877d8d0e25Snbeams // Setup 1288*0183ed61SJeremy L Thompson code << "\n" << tab << "// -----------------------------------------------------------------------------\n"; 1289*0183ed61SJeremy L Thompson code << tab << "// Operator Kernel\n"; 1290*0183ed61SJeremy L Thompson code << tab << "// \n"; 1291*0183ed61SJeremy L Thompson code << tab << "// d_[in,out]_i: CeedVector device array\n"; 1292*0183ed61SJeremy L Thompson code << tab << "// r_[in,out]_e_i: Element vector register\n"; 1293*0183ed61SJeremy L Thompson code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n"; 1294*0183ed61SJeremy L Thompson code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n"; 1295*0183ed61SJeremy L Thompson code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n"; 1296*0183ed61SJeremy L Thompson code << tab << "// \n"; 1297*0183ed61SJeremy L Thompson code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n"; 1298*0183ed61SJeremy L Thompson code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; 1299*0183ed61SJeremy L Thompson code << tab << "// -----------------------------------------------------------------------------\n"; 1300*0183ed61SJeremy L Thompson code << tab << "extern \"C\" __launch_bounds__(BLOCK_SIZE)\n"; 13012b730f8bSJeremy L Thompson code << "__global__ void " << operator_name 13023a2968d6SJeremy L Thompson << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar* W, Points_Hip points) {\n"; 1303*0183ed61SJeremy L Thompson tab.push(); 13044b3e95d5SJeremy L Thompson 13054b3e95d5SJeremy L Thompson // Scratch buffers 13069e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 13074b3e95d5SJeremy L Thompson CeedEvalMode eval_mode; 13084b3e95d5SJeremy L Thompson 13092b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 13109e201c85SYohann if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 1311*0183ed61SJeremy L Thompson code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; 13127d8d0e25Snbeams } 13137d8d0e25Snbeams } 13149e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 1315*0183ed61SJeremy L Thompson code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; 13167d8d0e25Snbeams } 13177d8d0e25Snbeams 1318*0183ed61SJeremy L Thompson code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; 131974398b5aSJeremy L Thompson if (!is_all_tensor) { 1320*0183ed61SJeremy L Thompson code << tab << "const CeedInt Q = " << Q << ";\n"; 132174398b5aSJeremy L Thompson } 132274398b5aSJeremy L Thompson if (!is_all_nontensor) { 1323*0183ed61SJeremy L Thompson code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n"; 132474398b5aSJeremy L Thompson } 13253a2968d6SJeremy L Thompson if (is_at_points) { 1326*0183ed61SJeremy L Thompson code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n"; 1327*0183ed61SJeremy L Thompson code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n"; 13283a2968d6SJeremy L Thompson } 13297d8d0e25Snbeams 13304b3e95d5SJeremy L Thompson // Shared data 1331*0183ed61SJeremy L Thompson code << tab << "extern __shared__ CeedScalar slice[];\n"; 1332*0183ed61SJeremy L Thompson code << tab << "SharedData_Hip data;\n"; 1333*0183ed61SJeremy L Thompson code << tab << "data.t_id_x = threadIdx.x;\n"; 1334*0183ed61SJeremy L Thompson code << tab << "data.t_id_y = threadIdx.y;\n"; 1335*0183ed61SJeremy L Thompson code << tab << "data.t_id_z = threadIdx.z;\n"; 1336*0183ed61SJeremy L Thompson code << tab << "data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 1337*0183ed61SJeremy L Thompson code << tab << "data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n"; 13387d8d0e25Snbeams 13399ee499e5SJeremy L Thompson // -- Determine input mat reuse 134045a787f7SJeremy L Thompson FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX]; 13419ee499e5SJeremy L Thompson 13429ee499e5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 134345a787f7SJeremy L Thompson input_matrix_reuse[i].index = -1; 13449ee499e5SJeremy L Thompson } 13459ee499e5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 134674398b5aSJeremy L Thompson bool is_tensor = true; 13479ee499e5SJeremy L Thompson CeedEvalMode eval_mode_i; 13489ee499e5SJeremy L Thompson CeedBasis basis_i; 13499ee499e5SJeremy L Thompson 13509ee499e5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i)); 13519ee499e5SJeremy L Thompson if (eval_mode_i == CEED_EVAL_WEIGHT) continue; 13529ee499e5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i)); 135374398b5aSJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor)); 135445a787f7SJeremy L Thompson for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) { 13559ee499e5SJeremy L Thompson CeedEvalMode eval_mode_j; 13569ee499e5SJeremy L Thompson CeedBasis basis_j; 13579ee499e5SJeremy L Thompson 13589ee499e5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); 13599ee499e5SJeremy L Thompson if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 13609ee499e5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); 13619ee499e5SJeremy L Thompson if (basis_i == basis_j) { 13629ee499e5SJeremy L Thompson if (is_tensor) { 136345a787f7SJeremy L Thompson input_matrix_reuse[i].index = j; 136445a787f7SJeremy L Thompson input_matrix_reuse[i].is_input = true; 136545a787f7SJeremy L Thompson input_matrix_reuse[i].eval_mode = eval_mode_j; 13669ee499e5SJeremy L Thompson } else { 13679ee499e5SJeremy L Thompson // For non-tensor can only re-use with the same eval mode 13689ee499e5SJeremy L Thompson if (eval_mode_i == eval_mode_j) { 136945a787f7SJeremy L Thompson input_matrix_reuse[i].index = j; 137045a787f7SJeremy L Thompson input_matrix_reuse[i].is_input = true; 137145a787f7SJeremy L Thompson input_matrix_reuse[i].eval_mode = eval_mode_j; 13729ee499e5SJeremy L Thompson } 13739ee499e5SJeremy L Thompson } 13749ee499e5SJeremy L Thompson } 13759ee499e5SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_j)); 13769ee499e5SJeremy L Thompson } 13779ee499e5SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_i)); 13789ee499e5SJeremy L Thompson } 13799ee499e5SJeremy L Thompson 13809ee499e5SJeremy L Thompson // -- Determine output mat reuse 138145a787f7SJeremy L Thompson FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX]; 13829ee499e5SJeremy L Thompson 13839ee499e5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 138445a787f7SJeremy L Thompson output_matrix_reuse[i].index = -1; 13859ee499e5SJeremy L Thompson } 13869ee499e5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 138774398b5aSJeremy L Thompson bool is_tensor = true; 13889ee499e5SJeremy L Thompson CeedEvalMode eval_mode_i; 13899ee499e5SJeremy L Thompson CeedBasis basis_i; 13909ee499e5SJeremy L Thompson 13919ee499e5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i)); 13929ee499e5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i)); 139345a787f7SJeremy L Thompson for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) { 13949ee499e5SJeremy L Thompson CeedEvalMode eval_mode_j; 13959ee499e5SJeremy L Thompson CeedBasis basis_j; 13969ee499e5SJeremy L Thompson 13979ee499e5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); 13989ee499e5SJeremy L Thompson if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 13999ee499e5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); 14009ee499e5SJeremy L Thompson if (basis_i == basis_j) { 14019ee499e5SJeremy L Thompson if (is_tensor) { 140245a787f7SJeremy L Thompson output_matrix_reuse[i].index = j; 140345a787f7SJeremy L Thompson output_matrix_reuse[i].is_input = true; 140445a787f7SJeremy L Thompson output_matrix_reuse[i].eval_mode = eval_mode_j; 14059ee499e5SJeremy L Thompson } else { 14069ee499e5SJeremy L Thompson // For non-tensor can only re-use with the same eval mode 14079ee499e5SJeremy L Thompson if (eval_mode_i == eval_mode_j) { 140845a787f7SJeremy L Thompson output_matrix_reuse[i].index = j; 140945a787f7SJeremy L Thompson output_matrix_reuse[i].is_input = true; 141045a787f7SJeremy L Thompson output_matrix_reuse[i].eval_mode = eval_mode_j; 14119ee499e5SJeremy L Thompson } 14129ee499e5SJeremy L Thompson } 14139ee499e5SJeremy L Thompson } 14149ee499e5SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_j)); 14159ee499e5SJeremy L Thompson } 141645a787f7SJeremy L Thompson for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) { 14179ee499e5SJeremy L Thompson CeedEvalMode eval_mode_j; 14189ee499e5SJeremy L Thompson CeedBasis basis_j; 14199ee499e5SJeremy L Thompson 14209ee499e5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j)); 14219ee499e5SJeremy L Thompson if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 14229ee499e5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j)); 142374398b5aSJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor)); 14249ee499e5SJeremy L Thompson if (basis_i == basis_j) { 14259ee499e5SJeremy L Thompson if (is_tensor) { 142645a787f7SJeremy L Thompson output_matrix_reuse[i].index = j; 142745a787f7SJeremy L Thompson output_matrix_reuse[i].is_input = false; 142845a787f7SJeremy L Thompson output_matrix_reuse[i].eval_mode = eval_mode_j; 14299ee499e5SJeremy L Thompson } else { 14309ee499e5SJeremy L Thompson // For non-tensor can only re-use with the same eval mode 14319ee499e5SJeremy L Thompson if (eval_mode_i == eval_mode_j) { 143245a787f7SJeremy L Thompson output_matrix_reuse[i].index = j; 143345a787f7SJeremy L Thompson output_matrix_reuse[i].is_input = false; 143445a787f7SJeremy L Thompson output_matrix_reuse[i].eval_mode = eval_mode_j; 14359ee499e5SJeremy L Thompson } 14369ee499e5SJeremy L Thompson } 14379ee499e5SJeremy L Thompson } 14389ee499e5SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_j)); 14399ee499e5SJeremy L Thompson } 14409ee499e5SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_i)); 14419ee499e5SJeremy L Thompson } 14429ee499e5SJeremy L Thompson 14437d8d0e25Snbeams // Initialize constants, and matrices B and G 1444*0183ed61SJeremy L Thompson code << "\n" << tab << "// Input field constants and basis data\n"; 14459e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 1446*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], 1447*0183ed61SJeremy L Thompson max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices)); 14487d8d0e25Snbeams } 1449*0183ed61SJeremy L Thompson code << "\n" << tab << "// Output field constants and basis data\n"; 14509e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 1451*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], 1452*0183ed61SJeremy L Thompson max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices)); 14534b3e95d5SJeremy L Thompson } 14547d8d0e25Snbeams 14554b3e95d5SJeremy L Thompson // Loop over all elements 1456*0183ed61SJeremy L Thompson code << "\n" << tab << "// Element loop\n"; 1457*0183ed61SJeremy L Thompson code << tab << "__syncthreads();\n"; 1458*0183ed61SJeremy L Thompson code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n"; 1459*0183ed61SJeremy L Thompson tab.push(); 14604b3e95d5SJeremy L Thompson 1461e93651e5SJeremy L Thompson // -- Compute minimum buffer space needed 14623a2968d6SJeremy L Thompson CeedInt max_rstr_buffer_size = 1; 1463e93651e5SJeremy L Thompson 1464e93651e5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 14656de40545SJeremy L Thompson CeedEvalMode eval_mode; 14666de40545SJeremy L Thompson 14676de40545SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 14686de40545SJeremy L Thompson if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) { 1469a61b1c91SJeremy L Thompson CeedInt num_comp; 1470e93651e5SJeremy L Thompson CeedElemRestriction elem_rstr; 1471e93651e5SJeremy L Thompson 1472e93651e5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 1473e93651e5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 1474a61b1c91SJeremy L Thompson max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); 1475681d0ea7SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 1476e93651e5SJeremy L Thompson } 14776de40545SJeremy L Thompson } 1478e93651e5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 14796de40545SJeremy L Thompson CeedEvalMode eval_mode; 14806de40545SJeremy L Thompson 14816de40545SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 14826de40545SJeremy L Thompson if (eval_mode != CEED_EVAL_NONE) { 1483a61b1c91SJeremy L Thompson CeedInt num_comp; 1484e93651e5SJeremy L Thompson CeedElemRestriction elem_rstr; 1485e93651e5SJeremy L Thompson 1486e93651e5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 1487e93651e5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 1488a61b1c91SJeremy L Thompson max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); 1489681d0ea7SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 1490e93651e5SJeremy L Thompson } 14916de40545SJeremy L Thompson } 1492*0183ed61SJeremy L Thompson code << tab << "// Scratch restriction buffer space\n"; 1493*0183ed61SJeremy L Thompson code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n"; 1494e93651e5SJeremy L Thompson 1495e93651e5SJeremy L Thompson // -- Determine best input field processing order 1496e93651e5SJeremy L Thompson CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX]; 1497e93651e5SJeremy L Thompson 1498e93651e5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 1499e93651e5SJeremy L Thompson field_rstr_in_buffer[i] = -1; 1500e93651e5SJeremy L Thompson input_field_order[i] = -1; 1501e93651e5SJeremy L Thompson } 1502e93651e5SJeremy L Thompson { 1503e93651e5SJeremy L Thompson bool is_ordered[CEED_FIELD_MAX]; 1504e93651e5SJeremy L Thompson CeedInt curr_index = 0; 1505e93651e5SJeremy L Thompson 1506e93651e5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false; 1507e93651e5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 1508e93651e5SJeremy L Thompson CeedVector vec_i; 1509e93651e5SJeremy L Thompson CeedElemRestriction rstr_i; 1510e93651e5SJeremy L Thompson 1511e93651e5SJeremy L Thompson if (is_ordered[i]) continue; 1512e93651e5SJeremy L Thompson field_rstr_in_buffer[i] = i; 1513e93651e5SJeremy L Thompson is_ordered[i] = true; 1514e93651e5SJeremy L Thompson input_field_order[curr_index] = i; 1515e93651e5SJeremy L Thompson curr_index++; 1516034f99fdSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i)); 1517e93651e5SJeremy L Thompson if (vec_i == CEED_VECTOR_NONE) continue; // CEED_EVAL_WEIGHT 1518e93651e5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i)); 1519e93651e5SJeremy L Thompson for (CeedInt j = i + 1; j < num_input_fields; j++) { 1520e93651e5SJeremy L Thompson CeedVector vec_j; 1521e93651e5SJeremy L Thompson CeedElemRestriction rstr_j; 1522e93651e5SJeremy L Thompson 1523e93651e5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j)); 1524e93651e5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j)); 1525e93651e5SJeremy L Thompson if (rstr_i == rstr_j && vec_i == vec_j) { 1526e93651e5SJeremy L Thompson field_rstr_in_buffer[j] = i; 1527e93651e5SJeremy L Thompson is_ordered[j] = true; 1528e93651e5SJeremy L Thompson input_field_order[curr_index] = j; 1529e93651e5SJeremy L Thompson curr_index++; 1530e93651e5SJeremy L Thompson } 15313a2968d6SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec_j)); 15323a2968d6SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); 1533e93651e5SJeremy L Thompson } 15343a2968d6SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec_i)); 15353a2968d6SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); 1536e93651e5SJeremy L Thompson } 1537e93651e5SJeremy L Thompson } 1538e93651e5SJeremy L Thompson 15394b3e95d5SJeremy L Thompson // -- Input restriction and basis 1540*0183ed61SJeremy L Thompson code << "\n" << tab << "// -- Input field restrictions and basis actions\n"; 15419e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 154259fa3f92SJeremy L Thompson const char *field_name; 154359fa3f92SJeremy L Thompson const CeedInt f = input_field_order[i]; 1544e93651e5SJeremy L Thompson 154559fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name)); 1546*0183ed61SJeremy L Thompson code << tab << "// ---- Input field " << f << ": " << field_name << "\n"; 15477d8d0e25Snbeams 15484b3e95d5SJeremy L Thompson // ---- Restriction 1549*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], 1550*0183ed61SJeremy L Thompson max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices)); 1551b7453713SJeremy L Thompson 15524b3e95d5SJeremy L Thompson // ---- Basis action 1553*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true, 1554*0183ed61SJeremy L Thompson is_all_tensor, is_at_points, use_3d_slices)); 15557d8d0e25Snbeams } 15567d8d0e25Snbeams 15574b3e95d5SJeremy L Thompson // -- Q function 1558*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields, 1559*0183ed61SJeremy L Thompson qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name, 1560*0183ed61SJeremy L Thompson Q_1d, is_all_tensor, is_at_points, use_3d_slices)); 15617d8d0e25Snbeams 15624b3e95d5SJeremy L Thompson // -- Output basis and restriction 1563*0183ed61SJeremy L Thompson code << "\n" << tab << "// -- Output field basis action and restrictions\n"; 15649e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 156559fa3f92SJeremy L Thompson const char *field_name; 156659fa3f92SJeremy L Thompson 156759fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 1568*0183ed61SJeremy L Thompson code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 1569b7453713SJeremy L Thompson 15704b3e95d5SJeremy L Thompson // ---- Basis action 1571*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false, 1572*0183ed61SJeremy L Thompson is_all_tensor, is_at_points, use_3d_slices)); 15737d8d0e25Snbeams 15744b3e95d5SJeremy L Thompson // ---- Restriction 1575*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, i, NULL, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, 1576*0183ed61SJeremy L Thompson false, is_all_tensor, is_at_points, use_3d_slices)); 15777d8d0e25Snbeams } 15787d8d0e25Snbeams 15794b3e95d5SJeremy L Thompson // Close loop and function 1580*0183ed61SJeremy L Thompson tab.pop(); 1581*0183ed61SJeremy L Thompson code << tab << "}\n"; 1582*0183ed61SJeremy L Thompson tab.pop(); 1583*0183ed61SJeremy L Thompson code << tab << "}\n"; 1584*0183ed61SJeremy L Thompson code << tab << "// -----------------------------------------------------------------------------\n\n"; 15857d8d0e25Snbeams 1586539ec17dSJeremy L Thompson CeedInt block_sizes[3] = {0, 0, 0}; 15879e201c85SYohann CeedInt num_elem; 1588b7453713SJeremy L Thompson 15893a2968d6SJeremy L Thompson // Compile 15902b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); 159174398b5aSJeremy L Thompson CeedCallBackend(BlockGridCalculate_Hip_gen(is_all_tensor ? max_dim : 1, num_elem, data->max_P_1d, is_all_tensor ? Q_1d : Q, block_sizes)); 159290c30374SJeremy L Thompson if (is_at_points) block_sizes[2] = 1; 15938d12f40eSJeremy L Thompson { 15948d12f40eSJeremy L Thompson bool is_compile_good = false; 15958d12f40eSJeremy L Thompson 1596a61b1c91SJeremy L Thompson data->thread_1d = block_sizes[0]; 15976b92dc4bSJeremy L Thompson CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "OP_T_1D", block_sizes[0], "BLOCK_SIZE", 15982b730f8bSJeremy L Thompson block_sizes[0] * block_sizes[1] * block_sizes[2])); 15998d12f40eSJeremy L Thompson if (is_compile_good) { 16008d12f40eSJeremy L Thompson *is_good_build = true; 1601eb7e6cafSJeremy L Thompson CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, operator_name.c_str(), &data->op)); 16028d12f40eSJeremy L Thompson } else { 16038d12f40eSJeremy L Thompson *is_good_build = false; 16048d12f40eSJeremy L Thompson data->use_fallback = true; 16058d12f40eSJeremy L Thompson } 16068d12f40eSJeremy L Thompson } 16072b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorSetSetupDone(op)); 16089bc66399SJeremy L Thompson CeedCallBackend(CeedDestroy(&ceed)); 1609c11e12f4SJeremy L Thompson CeedCallBackend(CeedQFunctionDestroy(&qf)); 1610e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 16117d8d0e25Snbeams } 16122a86cc9dSSebastian Grimberg 16137d8d0e25Snbeams //------------------------------------------------------------------------------ 1614*0183ed61SJeremy L Thompson // Build AtPoints assembly operator kernel 1615*0183ed61SJeremy L Thompson //------------------------------------------------------------------------------ 1616*0183ed61SJeremy L Thompson static int CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op, bool is_full, bool *is_good_build) { 1617*0183ed61SJeremy L Thompson bool is_all_tensor = true, is_at_points = false, use_3d_slices = false; 1618*0183ed61SJeremy L Thompson Ceed ceed; 1619*0183ed61SJeremy L Thompson CeedInt Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0; 1620*0183ed61SJeremy L Thompson CeedQFunctionField *qf_input_fields, *qf_output_fields; 1621*0183ed61SJeremy L Thompson CeedQFunction_Hip_gen *qf_data; 1622*0183ed61SJeremy L Thompson CeedQFunction qf; 1623*0183ed61SJeremy L Thompson CeedOperatorField *op_input_fields, *op_output_fields; 1624*0183ed61SJeremy L Thompson CeedOperator_Hip_gen *data; 1625*0183ed61SJeremy L Thompson std::ostringstream code; 1626*0183ed61SJeremy L Thompson Tab tab; 1627*0183ed61SJeremy L Thompson 1628*0183ed61SJeremy L Thompson // Check compatibility 1629*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 1630*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points)); 1631*0183ed61SJeremy L Thompson CeedCheck(is_at_points, ceed, CEED_ERROR_BACKEND, "Only AtPoints operator assembly supported"); 1632*0183ed61SJeremy L Thompson 1633*0183ed61SJeremy L Thompson // Retrieve operator data 1634*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorGetData(op, &data)); 1635*0183ed61SJeremy L Thompson Q = data->Q; 1636*0183ed61SJeremy L Thompson Q_1d = data->Q_1d; 1637*0183ed61SJeremy L Thompson max_dim = data->dim; 1638*0183ed61SJeremy L Thompson { 1639*0183ed61SJeremy L Thompson CeedElemRestriction rstr_points = NULL; 1640*0183ed61SJeremy L Thompson 1641*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); 1642*0183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points)); 1643*0183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride)); 1644*0183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); 1645*0183ed61SJeremy L Thompson } 1646*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 1647*0183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); 1648*0183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 1649*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 1650*0183ed61SJeremy L Thompson 1651*0183ed61SJeremy L Thompson // Load basis source files 1652*0183ed61SJeremy L Thompson code << tab << "// Tensor basis source\n"; 1653*0183ed61SJeremy L Thompson code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n"; 1654*0183ed61SJeremy L Thompson code << tab << "// AtPoints basis source\n"; 1655*0183ed61SJeremy L Thompson code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n"; 1656*0183ed61SJeremy L Thompson code << tab << "// CodeGen operator source\n"; 1657*0183ed61SJeremy L Thompson code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n"; 1658*0183ed61SJeremy L Thompson 1659*0183ed61SJeremy L Thompson // Get QFunction name 1660*0183ed61SJeremy L Thompson std::string qfunction_name(qf_data->qfunction_name); 1661*0183ed61SJeremy L Thompson std::string operator_name; 1662*0183ed61SJeremy L Thompson 1663*0183ed61SJeremy L Thompson if (is_full) { 1664*0183ed61SJeremy L Thompson operator_name = "CeedKernelHipGenOperatorFullAssembly_" + qfunction_name; 1665*0183ed61SJeremy L Thompson } else { 1666*0183ed61SJeremy L Thompson operator_name = "CeedKernelHipGenOperatorDiagonalAssembly_" + qfunction_name; 1667*0183ed61SJeremy L Thompson } 1668*0183ed61SJeremy L Thompson 1669*0183ed61SJeremy L Thompson // Define CEED_Q_VLA 1670*0183ed61SJeremy L Thompson code << "\n" << tab << "#undef CEED_Q_VLA\n"; 1671*0183ed61SJeremy L Thompson code << tab << "#define CEED_Q_VLA 1\n\n"; 1672*0183ed61SJeremy L Thompson 1673*0183ed61SJeremy L Thompson // Add user QFunction source 1674*0183ed61SJeremy L Thompson { 1675*0183ed61SJeremy L Thompson const char *source_path; 1676*0183ed61SJeremy L Thompson 1677*0183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path)); 1678*0183ed61SJeremy L Thompson CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file"); 1679*0183ed61SJeremy L Thompson 1680*0183ed61SJeremy L Thompson code << tab << "// User QFunction source\n"; 1681*0183ed61SJeremy L Thompson code << tab << "#include \"" << source_path << "\"\n\n"; 1682*0183ed61SJeremy L Thompson } 1683*0183ed61SJeremy L Thompson 1684*0183ed61SJeremy L Thompson // Setup 1685*0183ed61SJeremy L Thompson code << "\n" << tab << "// -----------------------------------------------------------------------------\n"; 1686*0183ed61SJeremy L Thompson code << tab << "// Operator Assembly Kernel\n"; 1687*0183ed61SJeremy L Thompson code << tab << "// \n"; 1688*0183ed61SJeremy L Thompson code << tab << "// d_[in,out]_i: CeedVector device array\n"; 1689*0183ed61SJeremy L Thompson code << tab << "// r_[in,out]_e_i: Element vector register\n"; 1690*0183ed61SJeremy L Thompson code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n"; 1691*0183ed61SJeremy L Thompson code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n"; 1692*0183ed61SJeremy L Thompson code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n"; 1693*0183ed61SJeremy L Thompson code << tab << "// \n"; 1694*0183ed61SJeremy L Thompson code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n"; 1695*0183ed61SJeremy L Thompson code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; 1696*0183ed61SJeremy L Thompson code << tab << "// -----------------------------------------------------------------------------\n"; 1697*0183ed61SJeremy L Thompson code << tab << "extern \"C\" __global__ void " << operator_name 1698*0183ed61SJeremy L Thompson << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar *W, Points_Hip " 1699*0183ed61SJeremy L Thompson "points, CeedScalar *__restrict__ values_array) {\n"; 1700*0183ed61SJeremy L Thompson tab.push(); 1701*0183ed61SJeremy L Thompson 1702*0183ed61SJeremy L Thompson // Scratch buffers 1703*0183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 1704*0183ed61SJeremy L Thompson CeedEvalMode eval_mode; 1705*0183ed61SJeremy L Thompson 1706*0183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 1707*0183ed61SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 1708*0183ed61SJeremy L Thompson code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; 1709*0183ed61SJeremy L Thompson } 1710*0183ed61SJeremy L Thompson } 1711*0183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 1712*0183ed61SJeremy L Thompson code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; 1713*0183ed61SJeremy L Thompson } 1714*0183ed61SJeremy L Thompson 1715*0183ed61SJeremy L Thompson code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; 1716*0183ed61SJeremy L Thompson code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n"; 1717*0183ed61SJeremy L Thompson code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n"; 1718*0183ed61SJeremy L Thompson code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n"; 1719*0183ed61SJeremy L Thompson 1720*0183ed61SJeremy L Thompson // Shared data 1721*0183ed61SJeremy L Thompson code << tab << "extern __shared__ CeedScalar slice[];\n"; 1722*0183ed61SJeremy L Thompson code << tab << "SharedData_Hip data;\n"; 1723*0183ed61SJeremy L Thompson code << tab << "data.t_id_x = threadIdx.x;\n"; 1724*0183ed61SJeremy L Thompson code << tab << "data.t_id_y = threadIdx.y;\n"; 1725*0183ed61SJeremy L Thompson code << tab << "data.t_id_z = threadIdx.z;\n"; 1726*0183ed61SJeremy L Thompson code << tab << "data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 1727*0183ed61SJeremy L Thompson code << tab << "data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n"; 1728*0183ed61SJeremy L Thompson 1729*0183ed61SJeremy L Thompson // -- Determine input mat reuse 1730*0183ed61SJeremy L Thompson FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX]; 1731*0183ed61SJeremy L Thompson 1732*0183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 1733*0183ed61SJeremy L Thompson input_matrix_reuse[i].index = -1; 1734*0183ed61SJeremy L Thompson } 1735*0183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 1736*0183ed61SJeremy L Thompson CeedEvalMode eval_mode_i; 1737*0183ed61SJeremy L Thompson CeedBasis basis_i; 1738*0183ed61SJeremy L Thompson 1739*0183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i)); 1740*0183ed61SJeremy L Thompson if (eval_mode_i == CEED_EVAL_WEIGHT) continue; 1741*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i)); 1742*0183ed61SJeremy L Thompson for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) { 1743*0183ed61SJeremy L Thompson CeedEvalMode eval_mode_j; 1744*0183ed61SJeremy L Thompson CeedBasis basis_j; 1745*0183ed61SJeremy L Thompson 1746*0183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); 1747*0183ed61SJeremy L Thompson if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 1748*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); 1749*0183ed61SJeremy L Thompson if (basis_i == basis_j) { 1750*0183ed61SJeremy L Thompson input_matrix_reuse[i].index = j; 1751*0183ed61SJeremy L Thompson input_matrix_reuse[i].is_input = true; 1752*0183ed61SJeremy L Thompson input_matrix_reuse[i].eval_mode = eval_mode_j; 1753*0183ed61SJeremy L Thompson } 1754*0183ed61SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_j)); 1755*0183ed61SJeremy L Thompson } 1756*0183ed61SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_i)); 1757*0183ed61SJeremy L Thompson } 1758*0183ed61SJeremy L Thompson 1759*0183ed61SJeremy L Thompson // -- Determine output mat reuse 1760*0183ed61SJeremy L Thompson FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX]; 1761*0183ed61SJeremy L Thompson 1762*0183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 1763*0183ed61SJeremy L Thompson output_matrix_reuse[i].index = -1; 1764*0183ed61SJeremy L Thompson } 1765*0183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 1766*0183ed61SJeremy L Thompson CeedEvalMode eval_mode_i; 1767*0183ed61SJeremy L Thompson CeedBasis basis_i; 1768*0183ed61SJeremy L Thompson 1769*0183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i)); 1770*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i)); 1771*0183ed61SJeremy L Thompson for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) { 1772*0183ed61SJeremy L Thompson CeedEvalMode eval_mode_j; 1773*0183ed61SJeremy L Thompson CeedBasis basis_j; 1774*0183ed61SJeremy L Thompson 1775*0183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); 1776*0183ed61SJeremy L Thompson if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 1777*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); 1778*0183ed61SJeremy L Thompson if (basis_i == basis_j) { 1779*0183ed61SJeremy L Thompson output_matrix_reuse[i].index = j; 1780*0183ed61SJeremy L Thompson output_matrix_reuse[i].is_input = true; 1781*0183ed61SJeremy L Thompson output_matrix_reuse[i].eval_mode = eval_mode_j; 1782*0183ed61SJeremy L Thompson } 1783*0183ed61SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_j)); 1784*0183ed61SJeremy L Thompson } 1785*0183ed61SJeremy L Thompson for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) { 1786*0183ed61SJeremy L Thompson CeedEvalMode eval_mode_j; 1787*0183ed61SJeremy L Thompson CeedBasis basis_j; 1788*0183ed61SJeremy L Thompson 1789*0183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j)); 1790*0183ed61SJeremy L Thompson if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 1791*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j)); 1792*0183ed61SJeremy L Thompson if (basis_i == basis_j) { 1793*0183ed61SJeremy L Thompson output_matrix_reuse[i].index = j; 1794*0183ed61SJeremy L Thompson output_matrix_reuse[i].is_input = false; 1795*0183ed61SJeremy L Thompson output_matrix_reuse[i].eval_mode = eval_mode_j; 1796*0183ed61SJeremy L Thompson } 1797*0183ed61SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_j)); 1798*0183ed61SJeremy L Thompson } 1799*0183ed61SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_i)); 1800*0183ed61SJeremy L Thompson } 1801*0183ed61SJeremy L Thompson 1802*0183ed61SJeremy L Thompson // Initialize constants, and matrices B and G 1803*0183ed61SJeremy L Thompson code << "\n" << tab << "// Input field constants and basis data\n"; 1804*0183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 1805*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], 1806*0183ed61SJeremy L Thompson max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices)); 1807*0183ed61SJeremy L Thompson } 1808*0183ed61SJeremy L Thompson code << "\n" << tab << "// Output field constants and basis data\n"; 1809*0183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 1810*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], 1811*0183ed61SJeremy L Thompson max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices)); 1812*0183ed61SJeremy L Thompson } 1813*0183ed61SJeremy L Thompson 1814*0183ed61SJeremy L Thompson // Loop over all elements 1815*0183ed61SJeremy L Thompson code << "\n" << tab << "// Element loop\n"; 1816*0183ed61SJeremy L Thompson code << tab << "__syncthreads();\n"; 1817*0183ed61SJeremy L Thompson code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n"; 1818*0183ed61SJeremy L Thompson tab.push(); 1819*0183ed61SJeremy L Thompson 1820*0183ed61SJeremy L Thompson // -- Compute minimum buffer space needed 1821*0183ed61SJeremy L Thompson CeedInt max_rstr_buffer_size = 1; 1822*0183ed61SJeremy L Thompson 1823*0183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 1824*0183ed61SJeremy L Thompson CeedEvalMode eval_mode; 1825*0183ed61SJeremy L Thompson 1826*0183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 1827*0183ed61SJeremy L Thompson if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) { 1828*0183ed61SJeremy L Thompson CeedInt num_comp; 1829*0183ed61SJeremy L Thompson CeedElemRestriction elem_rstr; 1830*0183ed61SJeremy L Thompson 1831*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 1832*0183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 1833*0183ed61SJeremy L Thompson max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); 1834*0183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 1835*0183ed61SJeremy L Thompson } 1836*0183ed61SJeremy L Thompson } 1837*0183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 1838*0183ed61SJeremy L Thompson CeedEvalMode eval_mode; 1839*0183ed61SJeremy L Thompson 1840*0183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 1841*0183ed61SJeremy L Thompson if (eval_mode != CEED_EVAL_NONE) { 1842*0183ed61SJeremy L Thompson CeedInt num_comp; 1843*0183ed61SJeremy L Thompson CeedElemRestriction elem_rstr; 1844*0183ed61SJeremy L Thompson 1845*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 1846*0183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 1847*0183ed61SJeremy L Thompson max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); 1848*0183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 1849*0183ed61SJeremy L Thompson } 1850*0183ed61SJeremy L Thompson } 1851*0183ed61SJeremy L Thompson code << tab << "// Scratch restriction buffer space\n"; 1852*0183ed61SJeremy L Thompson code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n"; 1853*0183ed61SJeremy L Thompson 1854*0183ed61SJeremy L Thompson // -- Determine best input field processing order 1855*0183ed61SJeremy L Thompson CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX]; 1856*0183ed61SJeremy L Thompson 1857*0183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 1858*0183ed61SJeremy L Thompson field_rstr_in_buffer[i] = -1; 1859*0183ed61SJeremy L Thompson input_field_order[i] = -1; 1860*0183ed61SJeremy L Thompson } 1861*0183ed61SJeremy L Thompson { 1862*0183ed61SJeremy L Thompson bool is_ordered[CEED_FIELD_MAX]; 1863*0183ed61SJeremy L Thompson CeedInt curr_index = 0; 1864*0183ed61SJeremy L Thompson 1865*0183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false; 1866*0183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 1867*0183ed61SJeremy L Thompson CeedVector vec_i; 1868*0183ed61SJeremy L Thompson CeedElemRestriction rstr_i; 1869*0183ed61SJeremy L Thompson 1870*0183ed61SJeremy L Thompson if (is_ordered[i]) continue; 1871*0183ed61SJeremy L Thompson field_rstr_in_buffer[i] = i; 1872*0183ed61SJeremy L Thompson is_ordered[i] = true; 1873*0183ed61SJeremy L Thompson input_field_order[curr_index] = i; 1874*0183ed61SJeremy L Thompson curr_index++; 1875*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i)); 1876*0183ed61SJeremy L Thompson if (vec_i == CEED_VECTOR_NONE) continue; // CEED_EVAL_WEIGHT 1877*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i)); 1878*0183ed61SJeremy L Thompson for (CeedInt j = i + 1; j < num_input_fields; j++) { 1879*0183ed61SJeremy L Thompson CeedVector vec_j; 1880*0183ed61SJeremy L Thompson CeedElemRestriction rstr_j; 1881*0183ed61SJeremy L Thompson 1882*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j)); 1883*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j)); 1884*0183ed61SJeremy L Thompson if (rstr_i == rstr_j && vec_i == vec_j) { 1885*0183ed61SJeremy L Thompson field_rstr_in_buffer[j] = i; 1886*0183ed61SJeremy L Thompson is_ordered[j] = true; 1887*0183ed61SJeremy L Thompson input_field_order[curr_index] = j; 1888*0183ed61SJeremy L Thompson curr_index++; 1889*0183ed61SJeremy L Thompson } 1890*0183ed61SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec_j)); 1891*0183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); 1892*0183ed61SJeremy L Thompson } 1893*0183ed61SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec_i)); 1894*0183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); 1895*0183ed61SJeremy L Thompson } 1896*0183ed61SJeremy L Thompson } 1897*0183ed61SJeremy L Thompson 1898*0183ed61SJeremy L Thompson // -- Input restriction and basis 1899*0183ed61SJeremy L Thompson code << "\n" << tab << "// -- Input field restrictions and basis actions\n"; 1900*0183ed61SJeremy L Thompson CeedInt active_field_index = -1; 1901*0183ed61SJeremy L Thompson 1902*0183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 1903*0183ed61SJeremy L Thompson bool is_active = false; 1904*0183ed61SJeremy L Thompson const char *field_name; 1905*0183ed61SJeremy L Thompson const CeedInt f = input_field_order[i]; 1906*0183ed61SJeremy L Thompson 1907*0183ed61SJeremy L Thompson { 1908*0183ed61SJeremy L Thompson CeedVector vec; 1909*0183ed61SJeremy L Thompson 1910*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec)); 1911*0183ed61SJeremy L Thompson is_active = vec == CEED_VECTOR_ACTIVE; 1912*0183ed61SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec)); 1913*0183ed61SJeremy L Thompson } 1914*0183ed61SJeremy L Thompson 1915*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name)); 1916*0183ed61SJeremy L Thompson code << tab << "// ---- Input field " << f << ": " << field_name << "\n"; 1917*0183ed61SJeremy L Thompson 1918*0183ed61SJeremy L Thompson if (is_active) { 1919*0183ed61SJeremy L Thompson std::string var_suffix = "_in_" + std::to_string(f); 1920*0183ed61SJeremy L Thompson 1921*0183ed61SJeremy L Thompson code << tab << "// Active field - no restriction or basis action here\n"; 1922*0183ed61SJeremy L Thompson if (active_field_index == -1) { 1923*0183ed61SJeremy L Thompson active_field_index = f; 1924*0183ed61SJeremy L Thompson code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? "P_1d" + var_suffix : "1") 1925*0183ed61SJeremy L Thompson << "] = {0.0};\n"; 1926*0183ed61SJeremy L Thompson } else { 1927*0183ed61SJeremy L Thompson code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_in_" << active_field_index << ";\n"; 1928*0183ed61SJeremy L Thompson } 1929*0183ed61SJeremy L Thompson } else { 1930*0183ed61SJeremy L Thompson // ---- Restriction 1931*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], 1932*0183ed61SJeremy L Thompson max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices)); 1933*0183ed61SJeremy L Thompson 1934*0183ed61SJeremy L Thompson // ---- Basis action 1935*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true, 1936*0183ed61SJeremy L Thompson is_all_tensor, is_at_points, use_3d_slices)); 1937*0183ed61SJeremy L Thompson } 1938*0183ed61SJeremy L Thompson } 1939*0183ed61SJeremy L Thompson 1940*0183ed61SJeremy L Thompson // -- Loop over active field 1941*0183ed61SJeremy L Thompson std::string active_var_suffix = "_in_" + std::to_string(active_field_index); 1942*0183ed61SJeremy L Thompson 1943*0183ed61SJeremy L Thompson code << "\n" << tab << "// Loop over nodes in active field\n"; 1944*0183ed61SJeremy L Thompson code << tab << "for (CeedInt n = 0; n < num_comp" << active_var_suffix << "*P_1d" << active_var_suffix 1945*0183ed61SJeremy L Thompson << (max_dim > 1 ? "*P_1d" + active_var_suffix : "") << (max_dim > 2 ? "*P_1d" + active_var_suffix : "") << "; n++) {\n"; 1946*0183ed61SJeremy L Thompson tab.push(); 1947*0183ed61SJeremy L Thompson 1948*0183ed61SJeremy L Thompson // -- Set current active node and component to 1 1949*0183ed61SJeremy L Thompson code << tab << "// Set current active node and component to 1.0\n"; 1950*0183ed61SJeremy L Thompson code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 1.0, r_e" 1951*0183ed61SJeremy L Thompson << active_var_suffix << ");\n\n"; 1952*0183ed61SJeremy L Thompson 1953*0183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 1954*0183ed61SJeremy L Thompson bool is_active = false; 1955*0183ed61SJeremy L Thompson const char *field_name; 1956*0183ed61SJeremy L Thompson const CeedInt f = input_field_order[i]; 1957*0183ed61SJeremy L Thompson 1958*0183ed61SJeremy L Thompson { 1959*0183ed61SJeremy L Thompson CeedVector vec; 1960*0183ed61SJeremy L Thompson 1961*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec)); 1962*0183ed61SJeremy L Thompson is_active = vec == CEED_VECTOR_ACTIVE; 1963*0183ed61SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec)); 1964*0183ed61SJeremy L Thompson } 1965*0183ed61SJeremy L Thompson if (!is_active) continue; 1966*0183ed61SJeremy L Thompson 1967*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name)); 1968*0183ed61SJeremy L Thompson code << tab << "// ---- Input field " << f << ": " << field_name << "\n"; 1969*0183ed61SJeremy L Thompson 1970*0183ed61SJeremy L Thompson // ---- Basis action 1971*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true, 1972*0183ed61SJeremy L Thompson is_all_tensor, is_at_points, use_3d_slices)); 1973*0183ed61SJeremy L Thompson } 1974*0183ed61SJeremy L Thompson 1975*0183ed61SJeremy L Thompson // -- Q function 1976*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields, 1977*0183ed61SJeremy L Thompson qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name, 1978*0183ed61SJeremy L Thompson Q_1d, is_all_tensor, is_at_points, use_3d_slices)); 1979*0183ed61SJeremy L Thompson 1980*0183ed61SJeremy L Thompson // -- Output basis and restriction 1981*0183ed61SJeremy L Thompson code << "\n" << tab << "// -- Output field basis action and restrictions\n"; 1982*0183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 1983*0183ed61SJeremy L Thompson bool is_active = false; 1984*0183ed61SJeremy L Thompson const char *field_name; 1985*0183ed61SJeremy L Thompson 1986*0183ed61SJeremy L Thompson { 1987*0183ed61SJeremy L Thompson CeedVector vec; 1988*0183ed61SJeremy L Thompson 1989*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); 1990*0183ed61SJeremy L Thompson is_active = vec == CEED_VECTOR_ACTIVE; 1991*0183ed61SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec)); 1992*0183ed61SJeremy L Thompson } 1993*0183ed61SJeremy L Thompson if (!is_active) continue; 1994*0183ed61SJeremy L Thompson 1995*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 1996*0183ed61SJeremy L Thompson code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 1997*0183ed61SJeremy L Thompson 1998*0183ed61SJeremy L Thompson // ---- Basis action 1999*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false, 2000*0183ed61SJeremy L Thompson is_all_tensor, is_at_points, use_3d_slices)); 2001*0183ed61SJeremy L Thompson 2002*0183ed61SJeremy L Thompson // ---- Restriction 2003*0183ed61SJeremy L Thompson if (is_full) { 2004*0183ed61SJeremy L Thompson // TODO: UPDATE OUTPUTS FOR FULL ASSEMBLY 2005*0183ed61SJeremy L Thompson } else { 2006*0183ed61SJeremy L Thompson std::string var_suffix = "_out_" + std::to_string(i); 2007*0183ed61SJeremy L Thompson CeedInt comp_stride; 2008*0183ed61SJeremy L Thompson CeedSize l_size; 2009*0183ed61SJeremy L Thompson CeedElemRestriction elem_rstr; 2010*0183ed61SJeremy L Thompson 2011*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 2012*0183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 2013*0183ed61SJeremy L Thompson code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; 2014*0183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 2015*0183ed61SJeremy L Thompson code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 2016*0183ed61SJeremy L Thompson code << tab << "WriteLVecStandard" << max_dim << "d_Single<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix 2017*0183ed61SJeremy L Thompson << ">(data, l_size" << var_suffix << ", elem, n, indices.outputs[" << i << "], r_e" << var_suffix << ", values_array);\n"; 2018*0183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 2019*0183ed61SJeremy L Thompson } 2020*0183ed61SJeremy L Thompson } 2021*0183ed61SJeremy L Thompson 2022*0183ed61SJeremy L Thompson // -- Reset current active node and component 2023*0183ed61SJeremy L Thompson code << "\n" << tab << "// Reset current active node and component to 0.0\n"; 2024*0183ed61SJeremy L Thompson code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 0.0, r_e" 2025*0183ed61SJeremy L Thompson << active_var_suffix << ");\n"; 2026*0183ed61SJeremy L Thompson 2027*0183ed61SJeremy L Thompson // -- End of loop over active field 2028*0183ed61SJeremy L Thompson tab.pop(); 2029*0183ed61SJeremy L Thompson code << tab << "}\n"; 2030*0183ed61SJeremy L Thompson 2031*0183ed61SJeremy L Thompson // Close loop and function 2032*0183ed61SJeremy L Thompson tab.pop(); 2033*0183ed61SJeremy L Thompson code << tab << "}\n"; 2034*0183ed61SJeremy L Thompson tab.pop(); 2035*0183ed61SJeremy L Thompson code << tab << "}\n"; 2036*0183ed61SJeremy L Thompson code << tab << "// -----------------------------------------------------------------------------\n\n"; 2037*0183ed61SJeremy L Thompson 2038*0183ed61SJeremy L Thompson CeedInt block_sizes[3] = {0, 0, 0}; 2039*0183ed61SJeremy L Thompson CeedInt num_elem; 2040*0183ed61SJeremy L Thompson 2041*0183ed61SJeremy L Thompson // Compile 2042*0183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); 2043*0183ed61SJeremy L Thompson CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes)); 2044*0183ed61SJeremy L Thompson block_sizes[2] = 1; 2045*0183ed61SJeremy L Thompson { 2046*0183ed61SJeremy L Thompson bool is_compile_good = false; 2047*0183ed61SJeremy L Thompson 2048*0183ed61SJeremy L Thompson data->thread_1d = block_sizes[0]; 2049*0183ed61SJeremy L Thompson CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, 2050*0183ed61SJeremy L Thompson is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 2, "OP_T_1D", block_sizes[0], 2051*0183ed61SJeremy L Thompson "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2])); 2052*0183ed61SJeremy L Thompson if (is_compile_good) { 2053*0183ed61SJeremy L Thompson *is_good_build = true; 2054*0183ed61SJeremy L Thompson CeedCallBackend(CeedGetKernel_Hip(ceed, is_full ? data->module_assemble_full : data->module_assemble_diagonal, operator_name.c_str(), 2055*0183ed61SJeremy L Thompson is_full ? &data->assemble_full : &data->assemble_diagonal)); 2056*0183ed61SJeremy L Thompson } else { 2057*0183ed61SJeremy L Thompson *is_good_build = false; 2058*0183ed61SJeremy L Thompson data->use_assembly_fallback = true; 2059*0183ed61SJeremy L Thompson } 2060*0183ed61SJeremy L Thompson } 2061*0183ed61SJeremy L Thompson CeedCallBackend(CeedDestroy(&ceed)); 2062*0183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionDestroy(&qf)); 2063*0183ed61SJeremy L Thompson return CEED_ERROR_SUCCESS; 2064*0183ed61SJeremy L Thompson } 2065*0183ed61SJeremy L Thompson 2066*0183ed61SJeremy L Thompson extern "C" int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) { 2067*0183ed61SJeremy L Thompson return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, false, is_good_build); 2068*0183ed61SJeremy L Thompson } 2069*0183ed61SJeremy L Thompson 2070*0183ed61SJeremy L Thompson //------------------------------------------------------------------------------ 2071