19ba83ac0SJeremy L Thompson // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors. 23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3241a4b83SYohann // 43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 5241a4b83SYohann // 63d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 73d576824SJeremy L Thompson 8b8e71988SJeremy L Thompson #define CEED_DEBUG_COLOR 12 97df94212SJeremy L Thompson 1049aac155SJeremy L Thompson #include <ceed.h> 11ec3da8bcSJed Brown #include <ceed/backend.h> 120183ed61SJeremy L Thompson #include <ceed/gen-tools.h> 139e201c85SYohann #include <ceed/jit-tools.h> 143d576824SJeremy L Thompson #include <cuda_runtime.h> 152b730f8bSJeremy L Thompson 16241a4b83SYohann #include <iostream> 17241a4b83SYohann #include <sstream> 18b2165e7aSSebastian Grimberg #include <string> 192b730f8bSJeremy L Thompson 200d0321e0SJeremy L Thompson #include "../cuda-ref/ceed-cuda-ref.h" 21241a4b83SYohann #include "../cuda-shared/ceed-cuda-shared.h" 2249aac155SJeremy L Thompson #include "../cuda/ceed-cuda-common.h" 232b730f8bSJeremy L Thompson #include "../cuda/ceed-cuda-compile.h" 242b730f8bSJeremy L Thompson #include "ceed-cuda-gen.h" 25241a4b83SYohann 2645a787f7SJeremy L Thompson struct FieldReuse_Cuda { 2745a787f7SJeremy L Thompson CeedInt index; 2845a787f7SJeremy L Thompson bool is_input; 2945a787f7SJeremy L Thompson CeedEvalMode eval_mode; 3045a787f7SJeremy L Thompson }; 3145a787f7SJeremy L Thompson 32ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 334b3e95d5SJeremy L Thompson // Determine type of operator 344b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 354b3e95d5SJeremy L Thompson static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields, 364b3e95d5SJeremy L Thompson CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields, 37412e5683SJeremy L Thompson CeedQFunctionField *qf_output_fields, CeedInt *max_P, CeedInt *max_P_1d, CeedInt *Q, CeedInt *Q_1d, 38412e5683SJeremy L Thompson CeedInt *max_dim, bool *is_all_tensor, bool *use_3d_slices) { 39412e5683SJeremy L Thompson // Check if all are tensor 40412e5683SJeremy L Thompson *is_all_tensor = true; 414b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 424b3e95d5SJeremy L Thompson CeedBasis basis; 434b3e95d5SJeremy L Thompson 444b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 454b3e95d5SJeremy L Thompson if (basis != CEED_BASIS_NONE) { 464b3e95d5SJeremy L Thompson bool is_field_tensor; 474b3e95d5SJeremy L Thompson 484b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor)); 49412e5683SJeremy L Thompson *is_all_tensor = *is_all_tensor && is_field_tensor; 504b3e95d5SJeremy L Thompson } 51681d0ea7SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 524b3e95d5SJeremy L Thompson } 534b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 544b3e95d5SJeremy L Thompson CeedBasis basis; 554b3e95d5SJeremy L Thompson 564b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 574b3e95d5SJeremy L Thompson if (basis != CEED_BASIS_NONE) { 584b3e95d5SJeremy L Thompson bool is_field_tensor; 594b3e95d5SJeremy L Thompson 604b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor)); 61412e5683SJeremy L Thompson *is_all_tensor = *is_all_tensor && is_field_tensor; 62412e5683SJeremy L Thompson } 63412e5683SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 64412e5683SJeremy L Thompson } 65412e5683SJeremy L Thompson 66412e5683SJeremy L Thompson // Find max_P, max_P_1d, Q, and Q_1d 67412e5683SJeremy L Thompson bool is_all_3d = true; 68412e5683SJeremy L Thompson 69412e5683SJeremy L Thompson *max_P = 0; 70412e5683SJeremy L Thompson *max_P_1d = 0; 71412e5683SJeremy L Thompson *Q = 0; 72412e5683SJeremy L Thompson *Q_1d = 0; 73412e5683SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 74412e5683SJeremy L Thompson CeedBasis basis; 75412e5683SJeremy L Thompson 76412e5683SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 77412e5683SJeremy L Thompson if (basis != CEED_BASIS_NONE) { 78412e5683SJeremy L Thompson bool is_field_tensor; 79412e5683SJeremy L Thompson CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0; 80412e5683SJeremy L Thompson 81412e5683SJeremy L Thompson // Check if 3D 824b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &field_dim)); 83412e5683SJeremy L Thompson is_all_3d = is_all_3d && (field_dim == 3); 84412e5683SJeremy L Thompson *max_dim = CeedIntMax(*max_dim, field_dim); 85412e5683SJeremy L Thompson 86412e5683SJeremy L Thompson // Collect P, P_1d, Q, and Q_1d 87412e5683SJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P)); 88412e5683SJeremy L Thompson *max_P = CeedIntMax(*max_P, field_P); 89412e5683SJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor)); 90412e5683SJeremy L Thompson if (is_field_tensor) { 91412e5683SJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d)); 92412e5683SJeremy L Thompson *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d); 93412e5683SJeremy L Thompson } 94412e5683SJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q)); 95412e5683SJeremy L Thompson CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible"); 96412e5683SJeremy L Thompson *Q = field_Q; 97412e5683SJeremy L Thompson if (is_field_tensor) { 98412e5683SJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d)); 994b3e95d5SJeremy L Thompson CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible"); 1004b3e95d5SJeremy L Thompson *Q_1d = field_Q_1d; 1014b3e95d5SJeremy L Thompson } 102412e5683SJeremy L Thompson } 103412e5683SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 104412e5683SJeremy L Thompson } 105412e5683SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 106412e5683SJeremy L Thompson CeedBasis basis; 107412e5683SJeremy L Thompson 108412e5683SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 109412e5683SJeremy L Thompson if (basis != CEED_BASIS_NONE) { 110412e5683SJeremy L Thompson bool is_field_tensor; 111412e5683SJeremy L Thompson CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0; 112412e5683SJeremy L Thompson 113412e5683SJeremy L Thompson // Check if 3D 114412e5683SJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &field_dim)); 115412e5683SJeremy L Thompson is_all_3d = is_all_3d && (field_dim == 3); 116412e5683SJeremy L Thompson *max_dim = CeedIntMax(*max_dim, field_dim); 117412e5683SJeremy L Thompson 118412e5683SJeremy L Thompson // Collect P, P_1d, Q, and Q_1d 119412e5683SJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P)); 120412e5683SJeremy L Thompson *max_P = CeedIntMax(*max_P, field_P); 121412e5683SJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor)); 122412e5683SJeremy L Thompson if (is_field_tensor) { 123412e5683SJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d)); 124412e5683SJeremy L Thompson *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d); 125412e5683SJeremy L Thompson } 126412e5683SJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q)); 127412e5683SJeremy L Thompson CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible"); 128412e5683SJeremy L Thompson *Q = field_Q; 129412e5683SJeremy L Thompson if (is_field_tensor) { 130412e5683SJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d)); 131412e5683SJeremy L Thompson CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible"); 132412e5683SJeremy L Thompson *Q_1d = field_Q_1d; 133412e5683SJeremy L Thompson } 134412e5683SJeremy L Thompson } 135681d0ea7SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 1364b3e95d5SJeremy L Thompson } 1374b3e95d5SJeremy L Thompson 1384b3e95d5SJeremy L Thompson // Only use 3D collocated gradient parallelization strategy when gradient is computed 1394b3e95d5SJeremy L Thompson *use_3d_slices = false; 140412e5683SJeremy L Thompson if (is_all_3d && *is_all_tensor) { 1414b3e95d5SJeremy L Thompson bool was_grad_found = false; 1424b3e95d5SJeremy L Thompson 1434b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 1444b3e95d5SJeremy L Thompson CeedEvalMode eval_mode; 1454b3e95d5SJeremy L Thompson 1464b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 1474b3e95d5SJeremy L Thompson if (eval_mode == CEED_EVAL_GRAD) { 1484b3e95d5SJeremy L Thompson CeedBasis_Cuda_shared *basis_data; 1494b3e95d5SJeremy L Thompson CeedBasis basis; 1504b3e95d5SJeremy L Thompson 1514b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 1524b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &basis_data)); 1534b3e95d5SJeremy L Thompson *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true); 1544b3e95d5SJeremy L Thompson was_grad_found = true; 155681d0ea7SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 1564b3e95d5SJeremy L Thompson } 1574b3e95d5SJeremy L Thompson } 1584b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 1594b3e95d5SJeremy L Thompson CeedEvalMode eval_mode; 1604b3e95d5SJeremy L Thompson 1614b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 1624b3e95d5SJeremy L Thompson if (eval_mode == CEED_EVAL_GRAD) { 1634b3e95d5SJeremy L Thompson CeedBasis_Cuda_shared *basis_data; 1644b3e95d5SJeremy L Thompson CeedBasis basis; 1654b3e95d5SJeremy L Thompson 1664b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 1674b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &basis_data)); 1684b3e95d5SJeremy L Thompson *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true); 1694b3e95d5SJeremy L Thompson was_grad_found = true; 170681d0ea7SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 1714b3e95d5SJeremy L Thompson } 1724b3e95d5SJeremy L Thompson } 1734b3e95d5SJeremy L Thompson } 1744b3e95d5SJeremy L Thompson return CEED_ERROR_SUCCESS; 1754b3e95d5SJeremy L Thompson } 1764b3e95d5SJeremy L Thompson 1774b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 1784b3e95d5SJeremy L Thompson // Setup fields 1794b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 1800183ed61SJeremy L Thompson static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, Tab &tab, CeedInt i, 1810183ed61SJeremy L Thompson CeedOperatorField op_field, CeedQFunctionField qf_field, FieldReuse_Cuda field_reuse, 1820183ed61SJeremy L Thompson CeedInt max_dim, CeedInt Q, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points, 183ca1da9b9SJeremy L Thompson bool use_3d_slices, bool skip_active_load) { 184ca1da9b9SJeremy L Thompson bool is_tensor = true, is_active = true; 185412e5683SJeremy L Thompson CeedBasis basis; 186ca1da9b9SJeremy L Thompson 187412e5683SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis)); 188412e5683SJeremy L Thompson if (basis != CEED_BASIS_NONE) CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 189ca1da9b9SJeremy L Thompson { 190ca1da9b9SJeremy L Thompson CeedVector vec; 191ca1da9b9SJeremy L Thompson 192ca1da9b9SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_field, &vec)); 193ca1da9b9SJeremy L Thompson is_active = vec == CEED_VECTOR_ACTIVE; 194ca1da9b9SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec)); 195ca1da9b9SJeremy L Thompson } 196412e5683SJeremy L Thompson 19759fa3f92SJeremy L Thompson const char *field_name; 1984b3e95d5SJeremy L Thompson std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i); 199dc007f05SJeremy L Thompson std::string P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q"; 2004b3e95d5SJeremy L Thompson std::string option_name = (is_input ? "inputs" : "outputs"); 2014b3e95d5SJeremy L Thompson CeedEvalMode eval_mode = CEED_EVAL_NONE; 2028014c5e7SJeremy L Thompson CeedInt elem_size = 0, num_comp = 0, dim = max_dim, P_1d = 0; 2034b3e95d5SJeremy L Thompson CeedElemRestriction elem_rstr; 2044b3e95d5SJeremy L Thompson CeedBasis_Cuda_shared *basis_data; 2054b3e95d5SJeremy L Thompson 2060a2a6492SJeremy L Thompson // Field reuse info 20745a787f7SJeremy L Thompson bool use_previous_field = field_reuse.index != -1; 2080a2a6492SJeremy L Thompson 20959fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_field, &field_name)); 2100183ed61SJeremy L Thompson code << tab << "// -- " << (is_input ? "Input" : "Output") << " field " << i << ": " << field_name << "\n"; 2114b3e95d5SJeremy L Thompson 2124b3e95d5SJeremy L Thompson // Get field data 2134b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr)); 2144b3e95d5SJeremy L Thompson if (elem_rstr != CEED_ELEMRESTRICTION_NONE) { 2154b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 2164b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 2174b3e95d5SJeremy L Thompson } 218681d0ea7SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 2194b3e95d5SJeremy L Thompson if (basis != CEED_BASIS_NONE) { 2204b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &basis_data)); 221412e5683SJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 222dc007f05SJeremy L Thompson if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 223dc007f05SJeremy L Thompson else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d)); 2244b3e95d5SJeremy L Thompson } 2254b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode)); 2264b3e95d5SJeremy L Thompson 2274b3e95d5SJeremy L Thompson // Set field constants 2280183ed61SJeremy L Thompson code << tab << "const CeedInt dim" << var_suffix << " = " << dim << ";\n"; 229412e5683SJeremy L Thompson if (is_tensor && !is_all_tensor) { 230412e5683SJeremy L Thompson CeedInt P = 0; 231412e5683SJeremy L Thompson 232412e5683SJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes(basis, &P)); 2330183ed61SJeremy L Thompson code << tab << "const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q : P) << ";\n"; 234412e5683SJeremy L Thompson } 2350183ed61SJeremy L Thompson code << tab << "const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n"; 236343e3094SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) { 2370183ed61SJeremy L Thompson code << tab << "const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n"; 2384b3e95d5SJeremy L Thompson } 2394b3e95d5SJeremy L Thompson 2404b3e95d5SJeremy L Thompson // Load basis data 2410183ed61SJeremy L Thompson code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 2424b3e95d5SJeremy L Thompson switch (eval_mode) { 2434b3e95d5SJeremy L Thompson case CEED_EVAL_NONE: 2444b3e95d5SJeremy L Thompson break; 2454b3e95d5SJeremy L Thompson case CEED_EVAL_INTERP: 2468b97b69aSJeremy L Thompson if (is_at_points) { 2478b97b69aSJeremy L Thompson // AtPoints 2488b97b69aSJeremy L Thompson if (!basis_data->d_chebyshev_interp_1d) { 2498b97b69aSJeremy L Thompson CeedSize interp_bytes; 2508b97b69aSJeremy L Thompson CeedScalar *chebyshev_interp_1d; 2518b97b69aSJeremy L Thompson 2528b97b69aSJeremy L Thompson interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 2538b97b69aSJeremy L Thompson CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 2548b97b69aSJeremy L Thompson CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 2558b97b69aSJeremy L Thompson CeedCallCuda(CeedBasisReturnCeed(basis), cudaMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes)); 2568b97b69aSJeremy L Thompson CeedCallCuda(CeedBasisReturnCeed(basis), 2578b97b69aSJeremy L Thompson cudaMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 2588b97b69aSJeremy L Thompson CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 2598b97b69aSJeremy L Thompson } 2608b97b69aSJeremy L Thompson if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d; 2618b97b69aSJeremy L Thompson else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d; 2628b97b69aSJeremy L Thompson } else { 2638b97b69aSJeremy L Thompson // Standard quadrature 2644b3e95d5SJeremy L Thompson if (is_input) data->B.inputs[i] = basis_data->d_interp_1d; 2654b3e95d5SJeremy L Thompson else data->B.outputs[i] = basis_data->d_interp_1d; 2668b97b69aSJeremy L Thompson } 267ca1da9b9SJeremy L Thompson if (use_previous_field && !skip_active_load) { 26845a787f7SJeremy L Thompson std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index)); 2690a2a6492SJeremy L Thompson 2700183ed61SJeremy L Thompson code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n"; 2710a2a6492SJeremy L Thompson } else { 2720ccda8ebSJeremy L Thompson bool is_collocated = false; 2730ccda8ebSJeremy L Thompson 2740ccda8ebSJeremy L Thompson CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated)); 275ca1da9b9SJeremy L Thompson if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) { 2760ccda8ebSJeremy L Thompson code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n"; 2770ccda8ebSJeremy L Thompson } else { 2780183ed61SJeremy L Thompson code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n"; 2790183ed61SJeremy L Thompson code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n"; 2800a2a6492SJeremy L Thompson } 2810ccda8ebSJeremy L Thompson } 2824b3e95d5SJeremy L Thompson break; 2834b3e95d5SJeremy L Thompson case CEED_EVAL_GRAD: 2848b97b69aSJeremy L Thompson if (is_at_points) { 2858b97b69aSJeremy L Thompson // AtPoints 2868b97b69aSJeremy L Thompson if (!basis_data->d_chebyshev_interp_1d) { 2878b97b69aSJeremy L Thompson CeedSize interp_bytes; 2888b97b69aSJeremy L Thompson CeedScalar *chebyshev_interp_1d; 2898b97b69aSJeremy L Thompson 2908b97b69aSJeremy L Thompson interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 2918b97b69aSJeremy L Thompson CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 2928b97b69aSJeremy L Thompson CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 2938b97b69aSJeremy L Thompson CeedCallCuda(CeedBasisReturnCeed(basis), cudaMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes)); 2948b97b69aSJeremy L Thompson CeedCallCuda(CeedBasisReturnCeed(basis), 2958b97b69aSJeremy L Thompson cudaMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 2968b97b69aSJeremy L Thompson CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 2978b97b69aSJeremy L Thompson } 2988b97b69aSJeremy L Thompson if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d; 2998b97b69aSJeremy L Thompson else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d; 3008b97b69aSJeremy L Thompson } else { 3018b97b69aSJeremy L Thompson // Standard quadrature 3024b3e95d5SJeremy L Thompson if (is_input) data->B.inputs[i] = basis_data->d_interp_1d; 3034b3e95d5SJeremy L Thompson else data->B.outputs[i] = basis_data->d_interp_1d; 3048b97b69aSJeremy L Thompson } 305dc007f05SJeremy L Thompson if (is_tensor) { 306ca1da9b9SJeremy L Thompson if (use_previous_field && !skip_active_load) { 30745a787f7SJeremy L Thompson std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index)); 3080a2a6492SJeremy L Thompson 3090183ed61SJeremy L Thompson code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n"; 3100a2a6492SJeremy L Thompson } else { 3110ccda8ebSJeremy L Thompson bool is_collocated = false; 3120ccda8ebSJeremy L Thompson 3130ccda8ebSJeremy L Thompson CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated)); 314ca1da9b9SJeremy L Thompson if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) { 3150ccda8ebSJeremy L Thompson code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n"; 3160ccda8ebSJeremy L Thompson } else { 3170183ed61SJeremy L Thompson code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n"; 3180183ed61SJeremy L Thompson code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n"; 319dc007f05SJeremy L Thompson } 3200a2a6492SJeremy L Thompson } 3210ccda8ebSJeremy L Thompson } 3228b97b69aSJeremy L Thompson if (is_at_points) break; // No G mat for AtPoints 3234b3e95d5SJeremy L Thompson if (use_3d_slices) { 3244b3e95d5SJeremy L Thompson if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d; 3254b3e95d5SJeremy L Thompson else data->G.outputs[i] = basis_data->d_collo_grad_1d; 326ca1da9b9SJeremy L Thompson if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) { 32745a787f7SJeremy L Thompson std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index)); 3280a2a6492SJeremy L Thompson 3290183ed61SJeremy L Thompson code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n"; 330ca1da9b9SJeremy L Thompson } else if (is_active && skip_active_load) { 331ca1da9b9SJeremy L Thompson code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n"; 3320a2a6492SJeremy L Thompson } else { 3330183ed61SJeremy L Thompson code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n"; 3340183ed61SJeremy L Thompson code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; 3350a2a6492SJeremy L Thompson } 3364b3e95d5SJeremy L Thompson } else { 3374b3e95d5SJeremy L Thompson bool has_collo_grad = basis_data->d_collo_grad_1d; 3384b3e95d5SJeremy L Thompson 3394b3e95d5SJeremy L Thompson if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; 3404b3e95d5SJeremy L Thompson else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; 3414b3e95d5SJeremy L Thompson if (has_collo_grad) { 342ca1da9b9SJeremy L Thompson if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) { 34345a787f7SJeremy L Thompson std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index)); 3440a2a6492SJeremy L Thompson 3450183ed61SJeremy L Thompson code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n"; 346ca1da9b9SJeremy L Thompson } else if (is_active && skip_active_load) { 347ca1da9b9SJeremy L Thompson code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n"; 3480a2a6492SJeremy L Thompson } else { 3490183ed61SJeremy L Thompson code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n"; 3500183ed61SJeremy L Thompson code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; 3510a2a6492SJeremy L Thompson } 3520a2a6492SJeremy L Thompson } else { 353ca1da9b9SJeremy L Thompson if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) { 35445a787f7SJeremy L Thompson std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index)); 3550a2a6492SJeremy L Thompson 3560183ed61SJeremy L Thompson code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n"; 357ca1da9b9SJeremy L Thompson } else if (is_active && skip_active_load) { 358ca1da9b9SJeremy L Thompson code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n"; 3594b3e95d5SJeremy L Thompson } else { 3600183ed61SJeremy L Thompson code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim") 361412e5683SJeremy L Thompson << (is_tensor ? "" : var_suffix) << "];\n"; 3620183ed61SJeremy L Thompson code << tab << "LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << (is_tensor ? "" : var_suffix) << ">(data, G." 363412e5683SJeremy L Thompson << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; 3644b3e95d5SJeremy L Thompson } 3654b3e95d5SJeremy L Thompson } 3660a2a6492SJeremy L Thompson } 3674b3e95d5SJeremy L Thompson break; 3684b3e95d5SJeremy L Thompson case CEED_EVAL_WEIGHT: 3694b3e95d5SJeremy L Thompson break; // No action 3704b3e95d5SJeremy L Thompson // LCOV_EXCL_START 3714b3e95d5SJeremy L Thompson case CEED_EVAL_DIV: 3724b3e95d5SJeremy L Thompson case CEED_EVAL_CURL: 3734b3e95d5SJeremy L Thompson break; // TODO: Not implemented 3744b3e95d5SJeremy L Thompson // LCOV_EXCL_STOP 3754b3e95d5SJeremy L Thompson } 3768b97b69aSJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 3774b3e95d5SJeremy L Thompson return CEED_ERROR_SUCCESS; 3784b3e95d5SJeremy L Thompson } 3794b3e95d5SJeremy L Thompson 3804b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 3814b3e95d5SJeremy L Thompson // Restriction 3824b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 3830183ed61SJeremy L Thompson static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, Tab &tab, CeedInt i, 3840183ed61SJeremy L Thompson CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field, 3850183ed61SJeremy L Thompson CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points, 3860183ed61SJeremy L Thompson bool use_3d_slices) { 3874b3e95d5SJeremy L Thompson std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i); 388412e5683SJeremy L Thompson std::string P_name = (is_all_tensor ? "P_1d" : "P") + var_suffix; 3894b3e95d5SJeremy L Thompson CeedEvalMode eval_mode = CEED_EVAL_NONE; 390412e5683SJeremy L Thompson CeedInt elem_size = 0, num_comp = 0; 3914b3e95d5SJeremy L Thompson CeedSize l_size; 392f815fac9SJeremy L Thompson CeedRestrictionType rstr_type = CEED_RESTRICTION_STANDARD; 3934b3e95d5SJeremy L Thompson CeedElemRestriction_Cuda *rstr_data; 3944b3e95d5SJeremy L Thompson CeedElemRestriction elem_rstr; 3954b3e95d5SJeremy L Thompson 3964b3e95d5SJeremy L Thompson // Get field data 3974b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr)); 3984b3e95d5SJeremy L Thompson if (elem_rstr != CEED_ELEMRESTRICTION_NONE) { 399f815fac9SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type)); 4004b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 4014b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 4024b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); 4034b3e95d5SJeremy L Thompson } 4044b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode)); 4054b3e95d5SJeremy L Thompson 4064b3e95d5SJeremy L Thompson // Restriction 4074b3e95d5SJeremy L Thompson if (is_input) { 4084b3e95d5SJeremy L Thompson // Input 409e93651e5SJeremy L Thompson if (field_input_buffer[i] != i) { 410e93651e5SJeremy L Thompson std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]); 411e93651e5SJeremy L Thompson 412e93651e5SJeremy L Thompson // Restriction was already done for previous input 4130183ed61SJeremy L Thompson code << tab << "CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n"; 4148b97b69aSJeremy L Thompson } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) { 4158b97b69aSJeremy L Thompson if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) { 416e93651e5SJeremy L Thompson // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated 4170183ed61SJeremy L Thompson code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n"; 4188b97b69aSJeremy L Thompson } else if (rstr_type != CEED_RESTRICTION_POINTS) { 419e93651e5SJeremy L Thompson // Otherwise we're using the scratch space 4200183ed61SJeremy L Thompson code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n"; 421e93651e5SJeremy L Thompson } 422f815fac9SJeremy L Thompson switch (rstr_type) { 423f815fac9SJeremy L Thompson case CEED_RESTRICTION_STANDARD: { 4244b3e95d5SJeremy L Thompson CeedInt comp_stride; 4254b3e95d5SJeremy L Thompson 4264b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 4270183ed61SJeremy L Thompson code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; 4284b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 4290183ed61SJeremy L Thompson code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 4304b3e95d5SJeremy L Thompson data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; 4310183ed61SJeremy L Thompson code << tab << "ReadLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", " 4320183ed61SJeremy L Thompson << P_name << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix 4330183ed61SJeremy L Thompson << ");\n"; 434f815fac9SJeremy L Thompson break; 435f815fac9SJeremy L Thompson } 436f815fac9SJeremy L Thompson case CEED_RESTRICTION_STRIDED: { 4374b3e95d5SJeremy L Thompson bool has_backend_strides; 4384b3e95d5SJeremy L Thompson CeedInt num_elem; 4394b3e95d5SJeremy L Thompson 4404b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); 4414b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); 4424b3e95d5SJeremy L Thompson CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 4434b3e95d5SJeremy L Thompson 4444b3e95d5SJeremy L Thompson if (!has_backend_strides) { 4454b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); 4464b3e95d5SJeremy L Thompson } 4470183ed61SJeremy L Thompson code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1] 4480183ed61SJeremy L Thompson << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n"; 4490183ed61SJeremy L Thompson code << tab << "ReadLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides" 4500183ed61SJeremy L Thompson << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, d" << var_suffix << ", r_e" 4510183ed61SJeremy L Thompson << var_suffix << ");\n"; 452f815fac9SJeremy L Thompson break; 453f815fac9SJeremy L Thompson } 4548b97b69aSJeremy L Thompson case CEED_RESTRICTION_POINTS: { 4558b97b69aSJeremy L Thompson CeedInt comp_stride; 4568b97b69aSJeremy L Thompson 4578b97b69aSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 4580183ed61SJeremy L Thompson code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 4598b97b69aSJeremy L Thompson data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; 4608b97b69aSJeremy L Thompson break; 4618b97b69aSJeremy L Thompson } 462f815fac9SJeremy L Thompson // LCOV_EXCL_START 463f815fac9SJeremy L Thompson case CEED_RESTRICTION_ORIENTED: 464f815fac9SJeremy L Thompson case CEED_RESTRICTION_CURL_ORIENTED: 465f815fac9SJeremy L Thompson break; // TODO: Not implemented 466f815fac9SJeremy L Thompson // LCOV_EXCL_STOP 4674b3e95d5SJeremy L Thompson } 4684b3e95d5SJeremy L Thompson } 4694b3e95d5SJeremy L Thompson } else { 4704b3e95d5SJeremy L Thompson // Output 471f815fac9SJeremy L Thompson switch (rstr_type) { 472f815fac9SJeremy L Thompson case CEED_RESTRICTION_STANDARD: { 4734b3e95d5SJeremy L Thompson CeedInt comp_stride; 4744b3e95d5SJeremy L Thompson 4754b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 4760183ed61SJeremy L Thompson code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; 4774b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 4780183ed61SJeremy L Thompson code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 4794b3e95d5SJeremy L Thompson data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets; 4800183ed61SJeremy L Thompson code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", " 4810183ed61SJeremy L Thompson << P_name << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix 4820183ed61SJeremy L Thompson << ");\n"; 483f815fac9SJeremy L Thompson break; 484f815fac9SJeremy L Thompson } 485f815fac9SJeremy L Thompson case CEED_RESTRICTION_STRIDED: { 4864b3e95d5SJeremy L Thompson bool has_backend_strides; 4874b3e95d5SJeremy L Thompson CeedInt num_elem; 4884b3e95d5SJeremy L Thompson 4894b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); 4904b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); 4914b3e95d5SJeremy L Thompson CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 4924b3e95d5SJeremy L Thompson 4934b3e95d5SJeremy L Thompson if (!has_backend_strides) { 4944b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); 4954b3e95d5SJeremy L Thompson } 4960183ed61SJeremy L Thompson code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1] 4970183ed61SJeremy L Thompson << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n"; 4980183ed61SJeremy L Thompson code << tab << "WriteLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides" 4990183ed61SJeremy L Thompson << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, r_e" << var_suffix << ", d" << var_suffix 5000183ed61SJeremy L Thompson << ");\n"; 501f815fac9SJeremy L Thompson break; 502f815fac9SJeremy L Thompson } 5038b97b69aSJeremy L Thompson case CEED_RESTRICTION_POINTS: 5048b97b69aSJeremy L Thompson data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets; 5058b97b69aSJeremy L Thompson break; 506f815fac9SJeremy L Thompson // LCOV_EXCL_START 507f815fac9SJeremy L Thompson case CEED_RESTRICTION_ORIENTED: 508f815fac9SJeremy L Thompson case CEED_RESTRICTION_CURL_ORIENTED: 509f815fac9SJeremy L Thompson break; // TODO: Not implemented 510f815fac9SJeremy L Thompson // LCOV_EXCL_STOP 5114b3e95d5SJeremy L Thompson } 5124b3e95d5SJeremy L Thompson } 513681d0ea7SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 5144b3e95d5SJeremy L Thompson return CEED_ERROR_SUCCESS; 5154b3e95d5SJeremy L Thompson } 5164b3e95d5SJeremy L Thompson 5174b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 5184b3e95d5SJeremy L Thompson // Basis 5194b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 5200183ed61SJeremy L Thompson static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, Tab &tab, CeedInt i, 5210183ed61SJeremy L Thompson CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d, 5220183ed61SJeremy L Thompson bool is_input, bool is_all_tensor, bool is_at_points, bool use_3d_slices) { 5230ccda8ebSJeremy L Thompson bool is_tensor = true, is_collocated = true; 524412e5683SJeremy L Thompson CeedBasis basis; 525412e5683SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis)); 526412e5683SJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 5270ccda8ebSJeremy L Thompson CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated)); 528412e5683SJeremy L Thompson 5294b3e95d5SJeremy L Thompson std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i); 530dc007f05SJeremy L Thompson std::string P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q"; 5314b3e95d5SJeremy L Thompson CeedEvalMode eval_mode = CEED_EVAL_NONE; 532412e5683SJeremy L Thompson CeedInt dim = max_dim, elem_size = 0, num_comp = 0, P_1d = 0; 5334b3e95d5SJeremy L Thompson CeedElemRestriction elem_rstr; 5344b3e95d5SJeremy L Thompson 5354b3e95d5SJeremy L Thompson // Get field data 5364b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr)); 5374b3e95d5SJeremy L Thompson if (elem_rstr != CEED_ELEMRESTRICTION_NONE) { 5384b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 5394b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 5404b3e95d5SJeremy L Thompson } 541681d0ea7SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 5424b3e95d5SJeremy L Thompson if (basis != CEED_BASIS_NONE) { 543412e5683SJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &dim)); 544dc007f05SJeremy L Thompson if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 545dc007f05SJeremy L Thompson else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d)); 5464b3e95d5SJeremy L Thompson } 5474b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode)); 5484b3e95d5SJeremy L Thompson 5494b3e95d5SJeremy L Thompson // Basis 5500183ed61SJeremy L Thompson code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 5514b3e95d5SJeremy L Thompson if (is_input) { 5524b3e95d5SJeremy L Thompson switch (eval_mode) { 5534b3e95d5SJeremy L Thompson case CEED_EVAL_NONE: 5548b97b69aSJeremy L Thompson if (!use_3d_slices && !is_at_points) { 5550183ed61SJeremy L Thompson code << tab << "CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n"; 5564b3e95d5SJeremy L Thompson } 5574b3e95d5SJeremy L Thompson break; 5584b3e95d5SJeremy L Thompson case CEED_EVAL_INTERP: 5598b97b69aSJeremy L Thompson if (is_at_points) { 560dc007f05SJeremy L Thompson std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d"; 561dc007f05SJeremy L Thompson 5620183ed61SJeremy L Thompson code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n"; 5630183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix 56499421279SJeremy L Thompson << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n"; 5658b97b69aSJeremy L Thompson } else { 5660ccda8ebSJeremy L Thompson std::string function_name = is_tensor ? ((dim == 1 ? "Interp" : "InterpTensor") + std::string(is_collocated ? "CollocatedNodes" : "") + 5670ccda8ebSJeremy L Thompson std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened")) 568412e5683SJeremy L Thompson : "InterpNonTensor"; 569c433aabcSJeremy L Thompson std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name); 570dc007f05SJeremy L Thompson 5710183ed61SJeremy L Thompson code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n"; 5720183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e" 573c433aabcSJeremy L Thompson << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n"; 5748b97b69aSJeremy L Thompson } 5754b3e95d5SJeremy L Thompson break; 5764b3e95d5SJeremy L Thompson case CEED_EVAL_GRAD: 5778b97b69aSJeremy L Thompson if (is_at_points) { 578dc007f05SJeremy L Thompson std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d"; 579dc007f05SJeremy L Thompson 5800183ed61SJeremy L Thompson code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n"; 5810183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix 58299421279SJeremy L Thompson << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n"; 5838b97b69aSJeremy L Thompson } else if (use_3d_slices) { 5840ccda8ebSJeremy L Thompson std::string function_name = 5850ccda8ebSJeremy L Thompson (dim > 1 ? "InterpTensor" : "Interp") + std::string(is_collocated ? "CollocatedNodes" : "") + std::to_string(dim) + "d"; 586dc007f05SJeremy L Thompson 5870183ed61SJeremy L Thompson code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n"; 5880183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix 58999421279SJeremy L Thompson << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n"; 590dc007f05SJeremy L Thompson } else if (is_tensor) { 5910ccda8ebSJeremy L Thompson bool is_collocated_grad = dim == 3 && Q_1d >= P_1d; 5920ccda8ebSJeremy L Thompson std::string function_name = 5937d878d16SJeremy L Thompson (dim == 1 ? "Grad" : ("GradTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) + 5947d878d16SJeremy L Thompson std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"); 595c433aabcSJeremy L Thompson std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name); 596dc007f05SJeremy L Thompson 5970183ed61SJeremy L Thompson code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*" 598259057edSJeremy L Thompson << (is_all_tensor && dim >= 3 ? Q_name : "1") << "];\n"; 5990183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e" 600c433aabcSJeremy L Thompson << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n"; 6014b3e95d5SJeremy L Thompson } else { 602dc007f05SJeremy L Thompson std::string function_name = "GradNonTensor"; 603dc007f05SJeremy L Thompson 6040183ed61SJeremy L Thompson code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n"; 6050183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name 606c433aabcSJeremy L Thompson << ", OP_T_1D>(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n"; 6074b3e95d5SJeremy L Thompson } 6084b3e95d5SJeremy L Thompson break; 6094b3e95d5SJeremy L Thompson case CEED_EVAL_WEIGHT: { 6108b97b69aSJeremy L Thompson if (is_at_points) { 6110183ed61SJeremy L Thompson code << tab << "// Nothing to do AtPoints\n"; 6128b97b69aSJeremy L Thompson } else { 6134b3e95d5SJeremy L Thompson CeedBasis_Cuda_shared *basis_data; 614412e5683SJeremy L Thompson std::string function_name = is_tensor 615412e5683SJeremy L Thompson ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened")) 616412e5683SJeremy L Thompson : "WeightNonTensor"; 6174b3e95d5SJeremy L Thompson 6180183ed61SJeremy L Thompson code << tab << "CeedScalar r_q" << var_suffix << "[" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n"; 6194b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &basis_data)); 6204b3e95d5SJeremy L Thompson data->W = basis_data->d_q_weight_1d; 6210183ed61SJeremy L Thompson code << tab << function_name << "<" << P_name << ", " << Q_name << ">(data, W, r_q" << var_suffix << ");\n"; 6228b97b69aSJeremy L Thompson } 6234b3e95d5SJeremy L Thompson break; 6244b3e95d5SJeremy L Thompson } 6254b3e95d5SJeremy L Thompson // LCOV_EXCL_START 6264b3e95d5SJeremy L Thompson case CEED_EVAL_DIV: 6274b3e95d5SJeremy L Thompson case CEED_EVAL_CURL: 6284b3e95d5SJeremy L Thompson break; // TODO: Not implemented 6294b3e95d5SJeremy L Thompson // LCOV_EXCL_STOP 6304b3e95d5SJeremy L Thompson } 6314b3e95d5SJeremy L Thompson } else { 6324b3e95d5SJeremy L Thompson switch (eval_mode) { 6334b3e95d5SJeremy L Thompson case CEED_EVAL_NONE: 6340183ed61SJeremy L Thompson code << tab << "CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n"; 6354b3e95d5SJeremy L Thompson break; // No action 6364b3e95d5SJeremy L Thompson case CEED_EVAL_INTERP: 6370183ed61SJeremy L Thompson code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n"; 6388b97b69aSJeremy L Thompson if (is_at_points) { 639dc007f05SJeremy L Thompson std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d"; 640dc007f05SJeremy L Thompson 6410183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix 64299421279SJeremy L Thompson << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; 6438b97b69aSJeremy L Thompson } else { 644dc007f05SJeremy L Thompson std::string function_name = 6450ccda8ebSJeremy L Thompson is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") + 6460ccda8ebSJeremy L Thompson std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened")) 647412e5683SJeremy L Thompson : "InterpTransposeNonTensor"; 648c433aabcSJeremy L Thompson std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name); 649dc007f05SJeremy L Thompson 6500183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q" 651c433aabcSJeremy L Thompson << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; 6528b97b69aSJeremy L Thompson } 6534b3e95d5SJeremy L Thompson break; 6544b3e95d5SJeremy L Thompson case CEED_EVAL_GRAD: 6550183ed61SJeremy L Thompson code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n"; 6568b97b69aSJeremy L Thompson if (is_at_points) { 657dc007f05SJeremy L Thompson std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d"; 658dc007f05SJeremy L Thompson 6590183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix 66099421279SJeremy L Thompson << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; 6618b97b69aSJeremy L Thompson } else if (use_3d_slices) { 6620ccda8ebSJeremy L Thompson std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") + 6630ccda8ebSJeremy L Thompson std::to_string(dim) + "d"; 664dc007f05SJeremy L Thompson 6650183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix 66699421279SJeremy L Thompson << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; 667dc007f05SJeremy L Thompson } else if (is_tensor) { 6680ccda8ebSJeremy L Thompson bool is_collocated_grad = dim == 3 && Q_1d >= P_1d; 6690ccda8ebSJeremy L Thompson std::string function_name = 6700ccda8ebSJeremy L Thompson (dim == 1 ? "GradTranspose" 6710ccda8ebSJeremy L Thompson : ("GradTransposeTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) + 672412e5683SJeremy L Thompson std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"); 673c433aabcSJeremy L Thompson std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name); 674dc007f05SJeremy L Thompson 6750183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q" 676c433aabcSJeremy L Thompson << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n"; 6774b3e95d5SJeremy L Thompson } else { 678dc007f05SJeremy L Thompson std::string function_name = "GradTransposeNonTensor"; 679dc007f05SJeremy L Thompson 6800183ed61SJeremy L Thompson code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name 681c433aabcSJeremy L Thompson << ", OP_T_1D>(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n"; 6824b3e95d5SJeremy L Thompson } 6834b3e95d5SJeremy L Thompson break; 6844b3e95d5SJeremy L Thompson // LCOV_EXCL_START 6854b3e95d5SJeremy L Thompson case CEED_EVAL_WEIGHT: 6864b3e95d5SJeremy L Thompson break; // Should not occur 6874b3e95d5SJeremy L Thompson case CEED_EVAL_DIV: 6884b3e95d5SJeremy L Thompson case CEED_EVAL_CURL: 6894b3e95d5SJeremy L Thompson break; // TODO: Not implemented 6904b3e95d5SJeremy L Thompson // LCOV_EXCL_STOP 6914b3e95d5SJeremy L Thompson } 6924b3e95d5SJeremy L Thompson } 693681d0ea7SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 6944b3e95d5SJeremy L Thompson return CEED_ERROR_SUCCESS; 6954b3e95d5SJeremy L Thompson } 6964b3e95d5SJeremy L Thompson 6974b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 6984b3e95d5SJeremy L Thompson // QFunction 6994b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 7000183ed61SJeremy L Thompson static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, Tab &tab, CeedInt max_dim, 7010183ed61SJeremy L Thompson CeedInt max_num_points, CeedInt num_input_fields, CeedOperatorField *op_input_fields, 7028b97b69aSJeremy L Thompson CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, 7038b97b69aSJeremy L Thompson CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields, 704412e5683SJeremy L Thompson std::string qfunction_name, CeedInt Q_1d, bool is_all_tensor, bool is_at_points, 705*745f16d1SZach Atkins bool use_3d_slices, bool is_assemble) { 706412e5683SJeremy L Thompson std::string Q_name = is_all_tensor ? "Q_1d" : "Q"; 7074b3e95d5SJeremy L Thompson CeedEvalMode eval_mode = CEED_EVAL_NONE; 7084b3e95d5SJeremy L Thompson CeedElemRestriction elem_rstr; 7094b3e95d5SJeremy L Thompson 7108b97b69aSJeremy L Thompson // Setup output arrays 7110183ed61SJeremy L Thompson code << "\n"; 7120183ed61SJeremy L Thompson code << tab << "// -- Output field setup\n"; 7134b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 71459fa3f92SJeremy L Thompson const char *field_name; 7154b3e95d5SJeremy L Thompson std::string var_suffix = "_out_" + std::to_string(i); 7164b3e95d5SJeremy L Thompson 71759fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 7180183ed61SJeremy L Thompson code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 7194b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 7208b97b69aSJeremy L Thompson switch (eval_mode) { 7218b97b69aSJeremy L Thompson case CEED_EVAL_NONE: 7228b97b69aSJeremy L Thompson if (is_at_points) { 7230183ed61SJeremy L Thompson code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n"; 7248b97b69aSJeremy L Thompson } else { 7250183ed61SJeremy L Thompson code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") 726412e5683SJeremy L Thompson << "];\n"; 7274b3e95d5SJeremy L Thompson } 7288b97b69aSJeremy L Thompson break; 7298b97b69aSJeremy L Thompson case CEED_EVAL_INTERP: 7308b97b69aSJeremy L Thompson if (is_at_points) { 7318b97b69aSJeremy L Thompson // Accumulator for point data 7320183ed61SJeremy L Thompson code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n"; 7330183ed61SJeremy L Thompson code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix 734b8245c6cSJeremy L Thompson << "[i] = 0.0;\n"; 7358b97b69aSJeremy L Thompson } else { 7360183ed61SJeremy L Thompson code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") 737412e5683SJeremy L Thompson << "];\n"; 7388b97b69aSJeremy L Thompson } 7398b97b69aSJeremy L Thompson break; 7408b97b69aSJeremy L Thompson case CEED_EVAL_GRAD: 7418b97b69aSJeremy L Thompson if (is_at_points) { 7428b97b69aSJeremy L Thompson // Accumulator for point data 7430183ed61SJeremy L Thompson code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n"; 7440183ed61SJeremy L Thompson code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix 745b8245c6cSJeremy L Thompson << "[i] = 0.0;\n"; 7468b97b69aSJeremy L Thompson } else if (use_3d_slices) { 7474b3e95d5SJeremy L Thompson // Accumulator for gradient slices 7480183ed61SJeremy L Thompson code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n"; 7490183ed61SJeremy L Thompson code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) r_q" << var_suffix << "[i] = 0.0;\n"; 7504b3e95d5SJeremy L Thompson } else { 7510183ed61SJeremy L Thompson code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*" 752412e5683SJeremy L Thompson << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") << "];\n"; 7534b3e95d5SJeremy L Thompson } 7548b97b69aSJeremy L Thompson break; 7558b97b69aSJeremy L Thompson case CEED_EVAL_WEIGHT: 7568b97b69aSJeremy L Thompson break; 7578b97b69aSJeremy L Thompson // LCOV_EXCL_START 7588b97b69aSJeremy L Thompson case CEED_EVAL_DIV: 7598b97b69aSJeremy L Thompson case CEED_EVAL_CURL: 7608b97b69aSJeremy L Thompson break; // TODO: Not implemented 7618b97b69aSJeremy L Thompson // LCOV_EXCL_STOP 7624b3e95d5SJeremy L Thompson } 7634b3e95d5SJeremy L Thompson } 7644b3e95d5SJeremy L Thompson 7658b97b69aSJeremy L Thompson if (is_at_points) { 7668b97b69aSJeremy L Thompson // We need to handle batches of points 7670183ed61SJeremy L Thompson code << "\n"; 7680183ed61SJeremy L Thompson code << tab << "// Note: Using batches of points\n"; 7690183ed61SJeremy L Thompson code << tab << "const CeedInt point_loop_bound = (blockDim.x*blockDim.y) * ceil((1.0*max_num_points) / (blockDim.x*blockDim.y));\n\n"; 7700183ed61SJeremy L Thompson code << tab << "#pragma unroll\n"; 7710183ed61SJeremy L Thompson code << tab << "for (CeedInt i = threadIdx.x + threadIdx.y*blockDim.x; i < point_loop_bound; i += blockDim.x*blockDim.y) {\n"; 7720183ed61SJeremy L Thompson tab.push(); 7730183ed61SJeremy L Thompson code << tab << "const CeedInt p = i % max_num_points;\n\n"; 7748b97b69aSJeremy L Thompson 7750183ed61SJeremy L Thompson code << tab << "// -- Coordinates\n"; 7760183ed61SJeremy L Thompson code << tab << "CeedScalar r_x[max_dim];\n"; 7770183ed61SJeremy L Thompson code << tab << "ReadPoint<max_dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n"; 7788b97b69aSJeremy L Thompson 7790183ed61SJeremy L Thompson code << tab << "// -- Input fields\n"; 7808b97b69aSJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 78159fa3f92SJeremy L Thompson const char *field_name; 7828b97b69aSJeremy L Thompson std::string var_suffix = "_in_" + std::to_string(i); 783f725b54bSJeremy L Thompson std::string P_name = "P_1d" + var_suffix; 7848b97b69aSJeremy L Thompson 78559fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name)); 7860183ed61SJeremy L Thompson code << tab << "// ---- Input field " << i << ": " << field_name << "\n"; 7878b97b69aSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 7888b97b69aSJeremy L Thompson // Basis action 7890183ed61SJeremy L Thompson code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 7908b97b69aSJeremy L Thompson switch (eval_mode) { 7918b97b69aSJeremy L Thompson case CEED_EVAL_NONE: 7920183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 7930183ed61SJeremy L Thompson code << tab << "ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix 7948b97b69aSJeremy L Thompson << ", max_num_points>(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n"; 7958b97b69aSJeremy L Thompson break; 7968b97b69aSJeremy L Thompson case CEED_EVAL_INTERP: 7970183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 7980183ed61SJeremy L Thompson code << tab << "InterpAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name 799412e5683SJeremy L Thompson << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n"; 8008b97b69aSJeremy L Thompson break; 8018b97b69aSJeremy L Thompson case CEED_EVAL_GRAD: 8020183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n"; 8030183ed61SJeremy L Thompson code << tab << "GradAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name 804412e5683SJeremy L Thompson << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n"; 8058b97b69aSJeremy L Thompson break; 8068b97b69aSJeremy L Thompson case CEED_EVAL_WEIGHT: 8070183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[1];\n"; 8080183ed61SJeremy L Thompson code << tab << "r_s" << var_suffix << "[0] = 1.0;\n"; 8098b97b69aSJeremy L Thompson break; 8108b97b69aSJeremy L Thompson // LCOV_EXCL_START 8118b97b69aSJeremy L Thompson case CEED_EVAL_DIV: 8128b97b69aSJeremy L Thompson case CEED_EVAL_CURL: 8138b97b69aSJeremy L Thompson break; // TODO: Not implemented 8148b97b69aSJeremy L Thompson // LCOV_EXCL_STOP 8158b97b69aSJeremy L Thompson } 8168b97b69aSJeremy L Thompson } 8170183ed61SJeremy L Thompson code << "\n"; 8180183ed61SJeremy L Thompson code << tab << "// -- Output fields\n"; 8198b97b69aSJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 82059fa3f92SJeremy L Thompson const char *field_name; 8218b97b69aSJeremy L Thompson std::string var_suffix = "_out_" + std::to_string(i); 8228b97b69aSJeremy L Thompson 82359fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 8240183ed61SJeremy L Thompson code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 8258b97b69aSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 8268b97b69aSJeremy L Thompson // Basis action 8278b97b69aSJeremy L Thompson switch (eval_mode) { 8288b97b69aSJeremy L Thompson case CEED_EVAL_NONE: 8290183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 8308b97b69aSJeremy L Thompson break; 8318b97b69aSJeremy L Thompson case CEED_EVAL_INTERP: 8320183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 8338b97b69aSJeremy L Thompson break; 8348b97b69aSJeremy L Thompson case CEED_EVAL_GRAD: 8350183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n"; 8368b97b69aSJeremy L Thompson break; 8378b97b69aSJeremy L Thompson // LCOV_EXCL_START 8388b97b69aSJeremy L Thompson case CEED_EVAL_WEIGHT: 8398b97b69aSJeremy L Thompson break; // Should not occur 8408b97b69aSJeremy L Thompson case CEED_EVAL_DIV: 8418b97b69aSJeremy L Thompson case CEED_EVAL_CURL: 8428b97b69aSJeremy L Thompson break; // TODO: Not implemented 8438b97b69aSJeremy L Thompson // LCOV_EXCL_STOP 8448b97b69aSJeremy L Thompson } 8458b97b69aSJeremy L Thompson } 8468b97b69aSJeremy L Thompson 8478b97b69aSJeremy L Thompson } else if (use_3d_slices) { 8484b3e95d5SJeremy L Thompson // We treat quadrature points per slice in 3d to save registers 8490183ed61SJeremy L Thompson code << "\n"; 8500183ed61SJeremy L Thompson code << tab << "// Note: Using planes of 3D elements\n"; 8510183ed61SJeremy L Thompson code << tab << "#pragma unroll\n"; 8520183ed61SJeremy L Thompson code << tab << "for (CeedInt q = 0; q < " << Q_name << "; q++) {\n"; 8530183ed61SJeremy L Thompson tab.push(); 8540183ed61SJeremy L Thompson code << tab << "// -- Input fields\n"; 8554b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 85659fa3f92SJeremy L Thompson const char *field_name; 8574b3e95d5SJeremy L Thompson std::string var_suffix = "_in_" + std::to_string(i); 8584b3e95d5SJeremy L Thompson 85959fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name)); 8600183ed61SJeremy L Thompson code << tab << "// ---- Input field " << i << ": " << field_name << "\n"; 8614b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 8624b3e95d5SJeremy L Thompson // Basis action 8630183ed61SJeremy L Thompson code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 8644b3e95d5SJeremy L Thompson switch (eval_mode) { 8654b3e95d5SJeremy L Thompson case CEED_EVAL_NONE: 8664b3e95d5SJeremy L Thompson bool is_strided; 8674b3e95d5SJeremy L Thompson 8680183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 8694b3e95d5SJeremy L Thompson 8704b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 8714b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); 8724b3e95d5SJeremy L Thompson if (is_strided) { 8734b3e95d5SJeremy L Thompson bool has_backend_strides; 8744b3e95d5SJeremy L Thompson CeedInt num_elem, elem_size; 8754b3e95d5SJeremy L Thompson 8764b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 8774b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); 8784b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); 8794b3e95d5SJeremy L Thompson CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 8804b3e95d5SJeremy L Thompson 8814b3e95d5SJeremy L Thompson if (!has_backend_strides) { 8824b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); 8834b3e95d5SJeremy L Thompson } 8840183ed61SJeremy L Thompson code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1] 8850183ed61SJeremy L Thompson << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n"; 8860183ed61SJeremy L Thompson code << tab << "ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", strides" << var_suffix << "_0, strides" 8870183ed61SJeremy L Thompson << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n"; 8884b3e95d5SJeremy L Thompson } else { 8894b3e95d5SJeremy L Thompson CeedSize l_size = 0; 8904b3e95d5SJeremy L Thompson CeedInt comp_stride; 8914b3e95d5SJeremy L Thompson CeedElemRestriction_Cuda *rstr_data; 8924b3e95d5SJeremy L Thompson 8934b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 8940183ed61SJeremy L Thompson code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; 8954b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 8960183ed61SJeremy L Thompson code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 8974b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); 8984b3e95d5SJeremy L Thompson data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; 8990183ed61SJeremy L Thompson code << tab << "ReadEVecSliceStandard3d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", " << Q_name << ">(data, l_size" 9000183ed61SJeremy L Thompson << var_suffix << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n"; 9014b3e95d5SJeremy L Thompson } 902681d0ea7SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 9034b3e95d5SJeremy L Thompson break; 9044b3e95d5SJeremy L Thompson case CEED_EVAL_INTERP: 9050183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 9060183ed61SJeremy L Thompson code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n"; 9070183ed61SJeremy L Thompson tab.push(); 9080ccda8ebSJeremy L Thompson code << tab << "r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n"; 9090183ed61SJeremy L Thompson tab.pop(); 9100183ed61SJeremy L Thompson code << tab << "}\n"; 9114b3e95d5SJeremy L Thompson break; 9124b3e95d5SJeremy L Thompson case CEED_EVAL_GRAD: 9130183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n"; 9140183ed61SJeremy L Thompson code << tab << "GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_q" << var_suffix << ", s_G" 91599421279SJeremy L Thompson << var_suffix << ", r_s" << var_suffix << ");\n"; 9164b3e95d5SJeremy L Thompson break; 9174b3e95d5SJeremy L Thompson case CEED_EVAL_WEIGHT: 9180183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[1];\n"; 9190183ed61SJeremy L Thompson code << tab << "r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n"; 9208b97b69aSJeremy L Thompson break; 9214b3e95d5SJeremy L Thompson // LCOV_EXCL_START 9224b3e95d5SJeremy L Thompson case CEED_EVAL_DIV: 9234b3e95d5SJeremy L Thompson case CEED_EVAL_CURL: 9244b3e95d5SJeremy L Thompson break; // TODO: Not implemented 9254b3e95d5SJeremy L Thompson // LCOV_EXCL_STOP 9264b3e95d5SJeremy L Thompson } 9274b3e95d5SJeremy L Thompson } 9280183ed61SJeremy L Thompson code << "\n"; 9290183ed61SJeremy L Thompson code << tab << "// -- Output fields\n"; 9304b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 93159fa3f92SJeremy L Thompson const char *field_name; 9324b3e95d5SJeremy L Thompson std::string var_suffix = "_out_" + std::to_string(i); 9334b3e95d5SJeremy L Thompson 93459fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 9350183ed61SJeremy L Thompson code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 9364b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 9374b3e95d5SJeremy L Thompson // Basis action 9384b3e95d5SJeremy L Thompson switch (eval_mode) { 9394b3e95d5SJeremy L Thompson case CEED_EVAL_NONE: 9400183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 9418b97b69aSJeremy L Thompson break; 9424b3e95d5SJeremy L Thompson case CEED_EVAL_INTERP: 9430183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 9444b3e95d5SJeremy L Thompson break; 9454b3e95d5SJeremy L Thompson case CEED_EVAL_GRAD: 9460183ed61SJeremy L Thompson code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n"; 9474b3e95d5SJeremy L Thompson break; 9484b3e95d5SJeremy L Thompson // LCOV_EXCL_START 9494b3e95d5SJeremy L Thompson case CEED_EVAL_WEIGHT: 9504b3e95d5SJeremy L Thompson break; // Should not occur 9514b3e95d5SJeremy L Thompson case CEED_EVAL_DIV: 9524b3e95d5SJeremy L Thompson case CEED_EVAL_CURL: 9534b3e95d5SJeremy L Thompson break; // TODO: Not implemented 9544b3e95d5SJeremy L Thompson // LCOV_EXCL_STOP 9554b3e95d5SJeremy L Thompson } 9564b3e95d5SJeremy L Thompson } 9574b3e95d5SJeremy L Thompson } else { 9580183ed61SJeremy L Thompson code << "\n"; 9590183ed61SJeremy L Thompson code << tab << "// Note: Using full elements\n"; 9600183ed61SJeremy L Thompson code << tab << "{\n"; 9610183ed61SJeremy L Thompson tab.push(); 9620183ed61SJeremy L Thompson code << tab << "// -- Input fields\n"; 9634b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 96459fa3f92SJeremy L Thompson const char *field_name; 96559fa3f92SJeremy L Thompson 96659fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name)); 9670183ed61SJeremy L Thompson code << tab << "// ---- Input field " << i << ": " << field_name << "\n"; 9680183ed61SJeremy L Thompson code << tab << "CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n"; 9694b3e95d5SJeremy L Thompson } 9700183ed61SJeremy L Thompson code << tab << "// -- Output fields\n"; 9714b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 97259fa3f92SJeremy L Thompson const char *field_name; 97359fa3f92SJeremy L Thompson 97459fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 9750183ed61SJeremy L Thompson code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 9760183ed61SJeremy L Thompson code << tab << "CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n"; 9774b3e95d5SJeremy L Thompson } 9784b3e95d5SJeremy L Thompson } 9794b3e95d5SJeremy L Thompson 9804b3e95d5SJeremy L Thompson // Input and output buffers 9810183ed61SJeremy L Thompson code << "\n"; 9820183ed61SJeremy L Thompson code << tab << "// -- QFunction inputs and outputs\n"; 9830183ed61SJeremy L Thompson code << tab << "// ---- Inputs\n"; 9840183ed61SJeremy L Thompson code << tab << "CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n"; 9854b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 98659fa3f92SJeremy L Thompson const char *field_name; 98759fa3f92SJeremy L Thompson 98859fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name)); 9890183ed61SJeremy L Thompson code << tab << "// ------ Input field " << i << ": " << field_name << "\n"; 9900183ed61SJeremy L Thompson code << tab << "inputs[" << i << "] = r_s_in_" << i << ";\n"; 9914b3e95d5SJeremy L Thompson } 9920183ed61SJeremy L Thompson code << tab << "// ---- Outputs\n"; 9930183ed61SJeremy L Thompson code << tab << "CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n"; 9944b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 99559fa3f92SJeremy L Thompson const char *field_name; 99659fa3f92SJeremy L Thompson 99759fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 9980183ed61SJeremy L Thompson code << tab << "// ------ Output field " << i << ": " << field_name << "\n"; 9990183ed61SJeremy L Thompson code << tab << "outputs[" << i << "] = r_s_out_" << i << ";\n"; 10004b3e95d5SJeremy L Thompson } 10014b3e95d5SJeremy L Thompson 10024b3e95d5SJeremy L Thompson // Apply QFunction 10030183ed61SJeremy L Thompson code << "\n"; 10040183ed61SJeremy L Thompson code << tab << "// -- Apply QFunction\n"; 10050183ed61SJeremy L Thompson code << tab << "" << qfunction_name << "(ctx, "; 1006412e5683SJeremy L Thompson if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) { 10074b3e95d5SJeremy L Thompson code << "1"; 10084b3e95d5SJeremy L Thompson } else { 1009dc007f05SJeremy L Thompson code << Q_name; 10104b3e95d5SJeremy L Thompson } 10114b3e95d5SJeremy L Thompson code << ", inputs, outputs);\n"; 10124b3e95d5SJeremy L Thompson 10138b97b69aSJeremy L Thompson if (is_at_points) { 10148b97b69aSJeremy L Thompson // Map back to coefficients 10150183ed61SJeremy L Thompson code << "\n"; 10160183ed61SJeremy L Thompson code << tab << "// -- Output fields\n"; 10178b97b69aSJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 101859fa3f92SJeremy L Thompson const char *field_name; 10198b97b69aSJeremy L Thompson std::string var_suffix = "_out_" + std::to_string(i); 10208b97b69aSJeremy L Thompson std::string P_name = "P_1d" + var_suffix; 10218b97b69aSJeremy L Thompson 102259fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 10230183ed61SJeremy L Thompson code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 10248b97b69aSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 10258b97b69aSJeremy L Thompson // Basis action 10260183ed61SJeremy L Thompson code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 10278b97b69aSJeremy L Thompson switch (eval_mode) { 10288b97b69aSJeremy L Thompson case CEED_EVAL_NONE: { 10298b97b69aSJeremy L Thompson CeedInt comp_stride; 10308b97b69aSJeremy L Thompson CeedElemRestriction elem_rstr; 10318b97b69aSJeremy L Thompson 1032*745f16d1SZach Atkins if (is_assemble) break; 10338b97b69aSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 10348b97b69aSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 10358b97b69aSJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 10360183ed61SJeremy L Thompson code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 10370183ed61SJeremy L Thompson code << tab << "WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix 10388b97b69aSJeremy L Thompson << ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]" 10398b97b69aSJeremy L Thompson << ", r_s" << var_suffix << ", d" << var_suffix << ");\n"; 10408b97b69aSJeremy L Thompson break; 10418b97b69aSJeremy L Thompson } 10428b97b69aSJeremy L Thompson case CEED_EVAL_INTERP: 10430183ed61SJeremy L Thompson code << tab << "if (i >= points.num_per_elem[elem]) {\n"; 10440183ed61SJeremy L Thompson tab.push(); 10450183ed61SJeremy L Thompson code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n"; 10460183ed61SJeremy L Thompson tab.pop(); 10470183ed61SJeremy L Thompson code << tab << "}\n"; 10480183ed61SJeremy L Thompson code << tab << "InterpTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name 1049f725b54bSJeremy L Thompson << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n"; 10508b97b69aSJeremy L Thompson break; 10518b97b69aSJeremy L Thompson case CEED_EVAL_GRAD: 10520183ed61SJeremy L Thompson code << tab << "if (i >= points.num_per_elem[elem]) {\n"; 10530183ed61SJeremy L Thompson tab.push(); 10540183ed61SJeremy L Thompson code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n"; 10550183ed61SJeremy L Thompson tab.pop(); 10560183ed61SJeremy L Thompson code << tab << "}\n"; 10570183ed61SJeremy L Thompson code << tab << "GradTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name 1058f725b54bSJeremy L Thompson << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n"; 10598b97b69aSJeremy L Thompson break; 10608b97b69aSJeremy L Thompson // LCOV_EXCL_START 10618b97b69aSJeremy L Thompson case CEED_EVAL_WEIGHT: 10628b97b69aSJeremy L Thompson break; // Should not occur 10638b97b69aSJeremy L Thompson case CEED_EVAL_DIV: 10648b97b69aSJeremy L Thompson case CEED_EVAL_CURL: 10658b97b69aSJeremy L Thompson break; // TODO: Not implemented 10668b97b69aSJeremy L Thompson // LCOV_EXCL_STOP 10678b97b69aSJeremy L Thompson } 10688b97b69aSJeremy L Thompson } 10698b97b69aSJeremy L Thompson } else if (use_3d_slices) { 10704b3e95d5SJeremy L Thompson // Copy or apply transpose grad, if needed 10710183ed61SJeremy L Thompson code << "\n"; 10720183ed61SJeremy L Thompson code << tab << "// -- Output fields\n"; 10734b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 107459fa3f92SJeremy L Thompson const char *field_name; 10754b3e95d5SJeremy L Thompson std::string var_suffix = "_out_" + std::to_string(i); 10764b3e95d5SJeremy L Thompson std::string P_name = "P_1d" + var_suffix; 10774b3e95d5SJeremy L Thompson 107859fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 10790183ed61SJeremy L Thompson code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 10804b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 10814b3e95d5SJeremy L Thompson // Basis action 10820183ed61SJeremy L Thompson code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 10834b3e95d5SJeremy L Thompson switch (eval_mode) { 10844b3e95d5SJeremy L Thompson case CEED_EVAL_NONE: 10850183ed61SJeremy L Thompson code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n"; 10860183ed61SJeremy L Thompson tab.push(); 10870183ed61SJeremy L Thompson code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n"; 10880183ed61SJeremy L Thompson tab.pop(); 10890183ed61SJeremy L Thompson code << tab << "}\n"; 10908b97b69aSJeremy L Thompson break; 10914b3e95d5SJeremy L Thompson case CEED_EVAL_INTERP: 10920183ed61SJeremy L Thompson code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n"; 10930183ed61SJeremy L Thompson tab.push(); 10940183ed61SJeremy L Thompson code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n"; 10950183ed61SJeremy L Thompson tab.pop(); 10960183ed61SJeremy L Thompson code << tab << "}\n"; 10974b3e95d5SJeremy L Thompson break; 10984b3e95d5SJeremy L Thompson case CEED_EVAL_GRAD: 10990183ed61SJeremy L Thompson code << tab << "GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_s" << var_suffix << ", s_G" 1100f815fac9SJeremy L Thompson << var_suffix << ", r_q" << var_suffix << ");\n"; 11014b3e95d5SJeremy L Thompson break; 11024b3e95d5SJeremy L Thompson // LCOV_EXCL_START 11034b3e95d5SJeremy L Thompson case CEED_EVAL_WEIGHT: 11044b3e95d5SJeremy L Thompson break; // Should not occur 11054b3e95d5SJeremy L Thompson case CEED_EVAL_DIV: 11064b3e95d5SJeremy L Thompson case CEED_EVAL_CURL: 11074b3e95d5SJeremy L Thompson break; // TODO: Not implemented 11084b3e95d5SJeremy L Thompson // LCOV_EXCL_STOP 11094b3e95d5SJeremy L Thompson } 11104b3e95d5SJeremy L Thompson } 11114b3e95d5SJeremy L Thompson } 11120183ed61SJeremy L Thompson tab.pop(); 11130183ed61SJeremy L Thompson code << tab << "}\n"; 11144b3e95d5SJeremy L Thompson return CEED_ERROR_SUCCESS; 11154b3e95d5SJeremy L Thompson } 11164b3e95d5SJeremy L Thompson 11174b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 1118b2165e7aSSebastian Grimberg // Build single operator kernel 1119ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 1120ddae5012SJeremy L Thompson extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_build) { 1121412e5683SJeremy L Thompson bool is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false; 1122241a4b83SYohann Ceed ceed; 1123efa41df3SJeremy L Thompson CeedInt Q = 0, Q_1d = 0, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0; 1124ca735530SJeremy L Thompson CeedQFunctionField *qf_input_fields, *qf_output_fields; 1125ca735530SJeremy L Thompson CeedQFunction_Cuda_gen *qf_data; 1126ca735530SJeremy L Thompson CeedQFunction qf; 1127ca735530SJeremy L Thompson CeedOperatorField *op_input_fields, *op_output_fields; 1128ca735530SJeremy L Thompson CeedOperator_Cuda_gen *data; 11294b3e95d5SJeremy L Thompson std::ostringstream code; 11300183ed61SJeremy L Thompson Tab tab; 11314b3e95d5SJeremy L Thompson 1132ddae5012SJeremy L Thompson CeedCallBackend(CeedOperatorGetData(op, &data)); 11334b3e95d5SJeremy L Thompson { 11344b3e95d5SJeremy L Thompson bool is_setup_done; 1135ca735530SJeremy L Thompson 1136ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); 1137ddae5012SJeremy L Thompson if (is_setup_done) { 1138ddae5012SJeremy L Thompson *is_good_build = !data->use_fallback; 1139ddae5012SJeremy L Thompson return CEED_ERROR_SUCCESS; 1140ddae5012SJeremy L Thompson } 1141ddae5012SJeremy L Thompson } 1142ddae5012SJeremy L Thompson 1143ddae5012SJeremy L Thompson // Check field compatibility 11448d12f40eSJeremy L Thompson CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 1145ddae5012SJeremy L Thompson { 1146412e5683SJeremy L Thompson bool has_shared_bases = true; 1147ddae5012SJeremy L Thompson 1148ddae5012SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 1149ddae5012SJeremy L Thompson CeedBasis basis; 1150ddae5012SJeremy L Thompson 1151ddae5012SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 1152ddae5012SJeremy L Thompson if (basis != CEED_BASIS_NONE) { 1153ddae5012SJeremy L Thompson bool is_tensor = true; 1154ddae5012SJeremy L Thompson const char *resource; 1155ddae5012SJeremy L Thompson char *resource_root; 1156ddae5012SJeremy L Thompson Ceed basis_ceed; 1157ddae5012SJeremy L Thompson 1158ddae5012SJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 1159c9192acaSJeremy L Thompson is_all_tensor = is_all_tensor && is_tensor; 116045a787f7SJeremy L Thompson is_all_nontensor = is_all_nontensor && !is_tensor; 1161ddae5012SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); 1162ddae5012SJeremy L Thompson CeedCallBackend(CeedGetResource(basis_ceed, &resource)); 1163ddae5012SJeremy L Thompson CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); 1164c9192acaSJeremy L Thompson has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared"); 1165ddae5012SJeremy L Thompson CeedCallBackend(CeedFree(&resource_root)); 1166ddae5012SJeremy L Thompson CeedCallBackend(CeedDestroy(&basis_ceed)); 1167ddae5012SJeremy L Thompson } 1168ddae5012SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 11694b3e95d5SJeremy L Thompson } 1170ca735530SJeremy L Thompson 1171ddae5012SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 1172ddae5012SJeremy L Thompson CeedBasis basis; 1173ddae5012SJeremy L Thompson 1174ddae5012SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 1175ddae5012SJeremy L Thompson if (basis != CEED_BASIS_NONE) { 1176ddae5012SJeremy L Thompson bool is_tensor = true; 1177ddae5012SJeremy L Thompson const char *resource; 1178ddae5012SJeremy L Thompson char *resource_root; 1179ddae5012SJeremy L Thompson Ceed basis_ceed; 1180ddae5012SJeremy L Thompson 1181ddae5012SJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 1182c9192acaSJeremy L Thompson is_all_tensor = is_all_tensor && is_tensor; 1183c9192acaSJeremy L Thompson is_all_nontensor = is_all_nontensor && !is_tensor; 1184ddae5012SJeremy L Thompson 1185ddae5012SJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); 1186ddae5012SJeremy L Thompson CeedCallBackend(CeedGetResource(basis_ceed, &resource)); 1187ddae5012SJeremy L Thompson CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); 1188c9192acaSJeremy L Thompson has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared"); 1189ddae5012SJeremy L Thompson CeedCallBackend(CeedFree(&resource_root)); 1190ddae5012SJeremy L Thompson CeedCallBackend(CeedDestroy(&basis_ceed)); 1191ddae5012SJeremy L Thompson } 1192ddae5012SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 1193ddae5012SJeremy L Thompson } 1194ddae5012SJeremy L Thompson // -- Fallback to ref if not all bases are shared 1195412e5683SJeremy L Thompson if (!has_shared_bases) { 1196ddae5012SJeremy L Thompson *is_good_build = false; 1197ddae5012SJeremy L Thompson return CEED_ERROR_SUCCESS; 1198ddae5012SJeremy L Thompson } 1199ddae5012SJeremy L Thompson } 1200ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 1201ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 1202ca735530SJeremy L Thompson CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); 1203ca735530SJeremy L Thompson CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 1204241a4b83SYohann 12054b3e95d5SJeremy L Thompson // Get operator data 12068b97b69aSJeremy L Thompson CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points)); 1207412e5683SJeremy L Thompson { 1208efa41df3SJeremy L Thompson CeedInt max_P = 0, max_P_1d = 0; 1209412e5683SJeremy L Thompson 1210412e5683SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelData_Cuda_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, 1211412e5683SJeremy L Thompson op_output_fields, qf_output_fields, &max_P, &max_P_1d, &Q, &Q_1d, &max_dim, &is_all_tensor, 1212412e5683SJeremy L Thompson &use_3d_slices)); 1213412e5683SJeremy L Thompson data->max_P_1d = is_all_tensor ? max_P_1d : max_P; 1214412e5683SJeremy L Thompson } 12158b97b69aSJeremy L Thompson if (is_at_points) { 1216e31b7a9fSJeremy L Thompson CeedInt coords_dim = 0; 12178b97b69aSJeremy L Thompson CeedElemRestriction_Cuda *rstr_data; 12188b97b69aSJeremy L Thompson CeedElemRestriction rstr_points = NULL; 12194b3e95d5SJeremy L Thompson 12208b97b69aSJeremy L Thompson CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); 12218b97b69aSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points)); 12228b97b69aSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride)); 1223e31b7a9fSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_points, &coords_dim)); 12248b97b69aSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data)); 12258b97b69aSJeremy L Thompson data->points.indices = (CeedInt *)rstr_data->d_offsets; 12268b97b69aSJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); 1227e31b7a9fSJeremy L Thompson if (max_dim == 0) max_dim = coords_dim; 1228e31b7a9fSJeremy L Thompson if (Q_1d == 0) max_num_points = ceil(pow(max_num_points, 1.0 / max_dim)); 12298b97b69aSJeremy L Thompson } 1230e31b7a9fSJeremy L Thompson if (max_dim == 0) max_dim = 1; 1231e31b7a9fSJeremy L Thompson data->dim = max_dim; 12328b97b69aSJeremy L Thompson if (is_at_points) use_3d_slices = false; 12338b97b69aSJeremy L Thompson if (Q_1d == 0) { 12348b97b69aSJeremy L Thompson if (is_at_points) Q_1d = max_num_points; 12358b97b69aSJeremy L Thompson else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d)); 12364b3e95d5SJeremy L Thompson } 1237c433aabcSJeremy L Thompson if (Q == 0) Q = Q_1d; 1238c433aabcSJeremy L Thompson data->Q = Q; 12394b3e95d5SJeremy L Thompson data->Q_1d = Q_1d; 12404b3e95d5SJeremy L Thompson 12410b454692Sjeremylt // Check for restriction only identity operator 12424b3e95d5SJeremy L Thompson { 12434b3e95d5SJeremy L Thompson bool is_identity_qf; 12444b3e95d5SJeremy L Thompson 12452b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf)); 12460b454692Sjeremylt if (is_identity_qf) { 12479e201c85SYohann CeedEvalMode eval_mode_in, eval_mode_out; 1248ca735530SJeremy L Thompson 12492b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in)); 12502b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out)); 12516574a04fSJeremy L Thompson CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND, 12526574a04fSJeremy L Thompson "Backend does not implement restriction only identity operators"); 12530b454692Sjeremylt } 12544b3e95d5SJeremy L Thompson } 12550b454692Sjeremylt 1256f1a13f77SYohann Dudouit // Add atomicAdd function for old NVidia architectures 12574b3e95d5SJeremy L Thompson { 12584b3e95d5SJeremy L Thompson Ceed_Cuda *ceed_data; 12594b3e95d5SJeremy L Thompson struct cudaDeviceProp prop; 12604b3e95d5SJeremy L Thompson 12612b730f8bSJeremy L Thompson CeedCallBackend(CeedGetData(ceed, &ceed_data)); 12622b730f8bSJeremy L Thompson CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id)); 126380a9ef05SNatalie Beams if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) { 12640183ed61SJeremy L Thompson code << tab << "// AtomicAdd fallback source\n"; 12650183ed61SJeremy L Thompson code << tab << "#include <ceed/jit-source/cuda/cuda-atomic-add-fallback.h>\n\n"; 1266f1a13f77SYohann Dudouit } 12674b3e95d5SJeremy L Thompson } 1268f1a13f77SYohann Dudouit 12699e201c85SYohann // Load basis source files 1270412e5683SJeremy L Thompson if (!is_all_nontensor) { 12710183ed61SJeremy L Thompson code << tab << "// Tensor basis source\n"; 12720183ed61SJeremy L Thompson code << tab << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n"; 1273412e5683SJeremy L Thompson } 1274412e5683SJeremy L Thompson if (!is_all_tensor) { 12750183ed61SJeremy L Thompson code << tab << "// Non-tensor basis source\n"; 12760183ed61SJeremy L Thompson code << tab << "#include <ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h>\n\n"; 1277dc007f05SJeremy L Thompson } 1278c8e372f0SJeremy L Thompson if (!is_all_tensor && !is_all_nontensor) { 1279c8e372f0SJeremy L Thompson code << "// Tensor basis source\n"; 1280c8e372f0SJeremy L Thompson code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h>\n\n"; 1281c8e372f0SJeremy L Thompson } 1282dc007f05SJeremy L Thompson if (is_at_points) { 12838b97b69aSJeremy L Thompson code << "// AtPoints basis source\n"; 12848b97b69aSJeremy L Thompson code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h>\n\n"; 1285dc007f05SJeremy L Thompson } 12869c25dd66SJeremy L Thompson code << "// CodeGen operator source\n"; 12879c25dd66SJeremy L Thompson code << "#include <ceed/jit-source/cuda/cuda-gen-templates.h>\n\n"; 1288241a4b83SYohann 12894b3e95d5SJeremy L Thompson // Get QFunction name 12904b3e95d5SJeremy L Thompson std::string qfunction_name(qf_data->qfunction_name); 12914b3e95d5SJeremy L Thompson std::string operator_name; 12924b3e95d5SJeremy L Thompson 129309095acaSJeremy L Thompson operator_name = "CeedKernelCudaGenOperator_" + qfunction_name; 12944d537eeaSYohann 12959e201c85SYohann // Define CEED_Q_VLA 12960183ed61SJeremy L Thompson code << "\n" << tab << "#undef CEED_Q_VLA\n"; 1297412e5683SJeremy L Thompson if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) { 12980183ed61SJeremy L Thompson code << tab << "#define CEED_Q_VLA 1\n\n"; 12999e201c85SYohann } else { 13000183ed61SJeremy L Thompson code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n"; 13019e201c85SYohann } 13029e201c85SYohann 13034b3e95d5SJeremy L Thompson // Add user QFunction source 13044b3e95d5SJeremy L Thompson { 13059c25dd66SJeremy L Thompson const char *source_path; 13064b3e95d5SJeremy L Thompson 13079c25dd66SJeremy L Thompson CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path)); 13089c25dd66SJeremy L Thompson CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/cuda/gen backend requires QFunction source code file"); 13099c25dd66SJeremy L Thompson 13100183ed61SJeremy L Thompson code << tab << "// User QFunction source\n"; 13110183ed61SJeremy L Thompson code << tab << "#include \"" << source_path << "\"\n\n"; 13124b3e95d5SJeremy L Thompson } 1313241a4b83SYohann 1314241a4b83SYohann // Setup 13150183ed61SJeremy L Thompson code << "\n" << tab << "// -----------------------------------------------------------------------------\n"; 13160183ed61SJeremy L Thompson code << tab << "// Operator Kernel\n"; 13170183ed61SJeremy L Thompson code << tab << "// \n"; 13180183ed61SJeremy L Thompson code << tab << "// d_[in,out]_i: CeedVector device array\n"; 13190183ed61SJeremy L Thompson code << tab << "// r_[in,out]_e_i: Element vector register\n"; 13200183ed61SJeremy L Thompson code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n"; 13210183ed61SJeremy L Thompson code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n"; 13220183ed61SJeremy L Thompson code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n"; 13230183ed61SJeremy L Thompson code << tab << "// \n"; 13240183ed61SJeremy L Thompson code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n"; 13250183ed61SJeremy L Thompson code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; 13260183ed61SJeremy L Thompson code << tab << "// -----------------------------------------------------------------------------\n"; 13270183ed61SJeremy L Thompson code << tab << "extern \"C\" __global__ void " << operator_name 13288b97b69aSJeremy L Thompson << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda " 13298b97b69aSJeremy L Thompson "points) {\n"; 13300183ed61SJeremy L Thompson tab.push(); 13314b3e95d5SJeremy L Thompson 13324b3e95d5SJeremy L Thompson // Scratch buffers 13339e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 13344b3e95d5SJeremy L Thompson CeedEvalMode eval_mode; 13354b3e95d5SJeremy L Thompson 13362b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 13379e201c85SYohann if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 13380183ed61SJeremy L Thompson code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; 1339241a4b83SYohann } 1340241a4b83SYohann } 13419e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 13420183ed61SJeremy L Thompson code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; 1343241a4b83SYohann } 13441da99368SJeremy L Thompson 13450183ed61SJeremy L Thompson code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; 1346412e5683SJeremy L Thompson if (!is_all_tensor) { 13470183ed61SJeremy L Thompson code << tab << "const CeedInt Q = " << Q << ";\n"; 1348412e5683SJeremy L Thompson } 1349412e5683SJeremy L Thompson if (!is_all_nontensor) { 13500183ed61SJeremy L Thompson code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n"; 1351412e5683SJeremy L Thompson } 13528b97b69aSJeremy L Thompson if (is_at_points) { 13530183ed61SJeremy L Thompson code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n"; 13540183ed61SJeremy L Thompson code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n"; 13558b97b69aSJeremy L Thompson } 13561da99368SJeremy L Thompson 13574b3e95d5SJeremy L Thompson // Shared data 13580183ed61SJeremy L Thompson code << tab << "extern __shared__ CeedScalar slice[];\n"; 13590183ed61SJeremy L Thompson code << tab << "SharedData_Cuda data;\n"; 13600183ed61SJeremy L Thompson code << tab << "data.t_id_x = threadIdx.x;\n"; 13610183ed61SJeremy L Thompson code << tab << "data.t_id_y = threadIdx.y;\n"; 13620183ed61SJeremy L Thompson code << tab << "data.t_id_z = threadIdx.z;\n"; 13630183ed61SJeremy L Thompson code << tab << "data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 13640183ed61SJeremy L Thompson code << tab << "data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n"; 1365920dcdc4Sjeremylt 13660a2a6492SJeremy L Thompson // -- Determine input mat reuse 136745a787f7SJeremy L Thompson FieldReuse_Cuda input_matrix_reuse[CEED_FIELD_MAX]; 13680a2a6492SJeremy L Thompson 13690a2a6492SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 137045a787f7SJeremy L Thompson input_matrix_reuse[i].index = -1; 13710a2a6492SJeremy L Thompson } 13720a2a6492SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 1373412e5683SJeremy L Thompson bool is_tensor = true; 13740a2a6492SJeremy L Thompson CeedEvalMode eval_mode_i; 13750a2a6492SJeremy L Thompson CeedBasis basis_i; 13760a2a6492SJeremy L Thompson 13770a2a6492SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i)); 13780a2a6492SJeremy L Thompson if (eval_mode_i == CEED_EVAL_WEIGHT) continue; 13790a2a6492SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i)); 1380412e5683SJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor)); 138145a787f7SJeremy L Thompson for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) { 13820a2a6492SJeremy L Thompson CeedEvalMode eval_mode_j; 13830a2a6492SJeremy L Thompson CeedBasis basis_j; 13840a2a6492SJeremy L Thompson 13850a2a6492SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); 13860a2a6492SJeremy L Thompson if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 13870a2a6492SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); 13880a2a6492SJeremy L Thompson if (basis_i == basis_j) { 13890a2a6492SJeremy L Thompson if (is_tensor) { 139045a787f7SJeremy L Thompson input_matrix_reuse[i].index = j; 139145a787f7SJeremy L Thompson input_matrix_reuse[i].is_input = true; 139245a787f7SJeremy L Thompson input_matrix_reuse[i].eval_mode = eval_mode_j; 13930a2a6492SJeremy L Thompson } else { 13940a2a6492SJeremy L Thompson // For non-tensor can only re-use with the same eval mode 13950a2a6492SJeremy L Thompson if (eval_mode_i == eval_mode_j) { 139645a787f7SJeremy L Thompson input_matrix_reuse[i].index = j; 139745a787f7SJeremy L Thompson input_matrix_reuse[i].is_input = true; 139845a787f7SJeremy L Thompson input_matrix_reuse[i].eval_mode = eval_mode_j; 13990a2a6492SJeremy L Thompson } 14000a2a6492SJeremy L Thompson } 14010a2a6492SJeremy L Thompson } 14020a2a6492SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_j)); 14030a2a6492SJeremy L Thompson } 14040a2a6492SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_i)); 14050a2a6492SJeremy L Thompson } 14060a2a6492SJeremy L Thompson 14070a2a6492SJeremy L Thompson // -- Determine output mat reuse 140845a787f7SJeremy L Thompson FieldReuse_Cuda output_matrix_reuse[CEED_FIELD_MAX]; 14090a2a6492SJeremy L Thompson 14100a2a6492SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 141145a787f7SJeremy L Thompson output_matrix_reuse[i].index = -1; 14120a2a6492SJeremy L Thompson } 14130a2a6492SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 1414412e5683SJeremy L Thompson bool is_tensor = true; 14150a2a6492SJeremy L Thompson CeedEvalMode eval_mode_i; 14160a2a6492SJeremy L Thompson CeedBasis basis_i; 14170a2a6492SJeremy L Thompson 14180a2a6492SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i)); 14190a2a6492SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i)); 1420412e5683SJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor)); 142145a787f7SJeremy L Thompson for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) { 14220a2a6492SJeremy L Thompson CeedEvalMode eval_mode_j; 14230a2a6492SJeremy L Thompson CeedBasis basis_j; 14240a2a6492SJeremy L Thompson 14250a2a6492SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); 14260a2a6492SJeremy L Thompson if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 14270a2a6492SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); 14280a2a6492SJeremy L Thompson if (basis_i == basis_j) { 14290a2a6492SJeremy L Thompson if (is_tensor) { 143045a787f7SJeremy L Thompson output_matrix_reuse[i].index = j; 143145a787f7SJeremy L Thompson output_matrix_reuse[i].is_input = true; 143245a787f7SJeremy L Thompson output_matrix_reuse[i].eval_mode = eval_mode_j; 14330a2a6492SJeremy L Thompson } else { 14340a2a6492SJeremy L Thompson // For non-tensor can only re-use with the same eval mode 14350a2a6492SJeremy L Thompson if (eval_mode_i == eval_mode_j) { 143645a787f7SJeremy L Thompson output_matrix_reuse[i].index = j; 143745a787f7SJeremy L Thompson output_matrix_reuse[i].is_input = true; 143845a787f7SJeremy L Thompson output_matrix_reuse[i].eval_mode = eval_mode_j; 14390a2a6492SJeremy L Thompson } 14400a2a6492SJeremy L Thompson } 14410a2a6492SJeremy L Thompson } 14420a2a6492SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_j)); 14430a2a6492SJeremy L Thompson } 144445a787f7SJeremy L Thompson for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) { 14450a2a6492SJeremy L Thompson CeedEvalMode eval_mode_j; 14460a2a6492SJeremy L Thompson CeedBasis basis_j; 14470a2a6492SJeremy L Thompson 14480a2a6492SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j)); 14490a2a6492SJeremy L Thompson if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 14500a2a6492SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j)); 14510a2a6492SJeremy L Thompson if (basis_i == basis_j) { 14520a2a6492SJeremy L Thompson if (is_tensor) { 145345a787f7SJeremy L Thompson output_matrix_reuse[i].index = j; 145445a787f7SJeremy L Thompson output_matrix_reuse[i].is_input = false; 145545a787f7SJeremy L Thompson output_matrix_reuse[i].eval_mode = eval_mode_j; 14560a2a6492SJeremy L Thompson } else { 14570a2a6492SJeremy L Thompson // For non-tensor can only re-use with the same eval mode 14580a2a6492SJeremy L Thompson if (eval_mode_i == eval_mode_j) { 145945a787f7SJeremy L Thompson output_matrix_reuse[i].index = j; 146045a787f7SJeremy L Thompson output_matrix_reuse[i].is_input = false; 146145a787f7SJeremy L Thompson output_matrix_reuse[i].eval_mode = eval_mode_j; 14620a2a6492SJeremy L Thompson } 14630a2a6492SJeremy L Thompson } 14640a2a6492SJeremy L Thompson } 14650a2a6492SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_j)); 14660a2a6492SJeremy L Thompson } 14670a2a6492SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_i)); 14680a2a6492SJeremy L Thompson } 14690a2a6492SJeremy L Thompson 1470ac421f39SYohann // Initialize constants, and matrices B and G 14710183ed61SJeremy L Thompson code << "\n" << tab << "// Input field constants and basis data\n"; 14729e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 14730183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], 1474ca1da9b9SJeremy L Thompson max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false)); 1475920dcdc4Sjeremylt } 14760183ed61SJeremy L Thompson code << "\n" << tab << "// Output field constants and basis data\n"; 14779e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 14780183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], 1479ca1da9b9SJeremy L Thompson max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false)); 14804b3e95d5SJeremy L Thompson } 1481920dcdc4Sjeremylt 14824b3e95d5SJeremy L Thompson // Loop over all elements 14830183ed61SJeremy L Thompson code << "\n" << tab << "// Element loop\n"; 14840183ed61SJeremy L Thompson code << tab << "__syncthreads();\n"; 14850183ed61SJeremy L Thompson code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n"; 14860183ed61SJeremy L Thompson tab.push(); 14874b3e95d5SJeremy L Thompson 1488e93651e5SJeremy L Thompson // -- Compute minimum buffer space needed 14898b97b69aSJeremy L Thompson CeedInt max_rstr_buffer_size = 1; 1490e93651e5SJeremy L Thompson 14919e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 14926de40545SJeremy L Thompson CeedEvalMode eval_mode; 14936de40545SJeremy L Thompson 14946de40545SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 14956de40545SJeremy L Thompson if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) { 1496a61b1c91SJeremy L Thompson CeedInt num_comp; 1497e93651e5SJeremy L Thompson CeedElemRestriction elem_rstr; 1498e93651e5SJeremy L Thompson 1499e93651e5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 1500e93651e5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 1501a61b1c91SJeremy L Thompson max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); 1502681d0ea7SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 1503e93651e5SJeremy L Thompson } 15046de40545SJeremy L Thompson } 1505e93651e5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 15066de40545SJeremy L Thompson CeedEvalMode eval_mode; 15076de40545SJeremy L Thompson 15086de40545SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 15096de40545SJeremy L Thompson if (eval_mode != CEED_EVAL_NONE) { 1510a61b1c91SJeremy L Thompson CeedInt num_comp; 1511e93651e5SJeremy L Thompson CeedElemRestriction elem_rstr; 1512e93651e5SJeremy L Thompson 1513e93651e5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 1514e93651e5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 1515a61b1c91SJeremy L Thompson max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); 1516681d0ea7SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 1517e93651e5SJeremy L Thompson } 15186de40545SJeremy L Thompson } 15190183ed61SJeremy L Thompson code << tab << "// Scratch restriction buffer space\n"; 15200183ed61SJeremy L Thompson code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n"; 1521e93651e5SJeremy L Thompson 1522e93651e5SJeremy L Thompson // -- Determine best input field processing order 1523e93651e5SJeremy L Thompson CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX]; 1524e93651e5SJeremy L Thompson 1525e93651e5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 1526e93651e5SJeremy L Thompson field_rstr_in_buffer[i] = -1; 1527e93651e5SJeremy L Thompson input_field_order[i] = -1; 1528e93651e5SJeremy L Thompson } 1529e93651e5SJeremy L Thompson { 1530e93651e5SJeremy L Thompson bool is_ordered[CEED_FIELD_MAX]; 1531e93651e5SJeremy L Thompson CeedInt curr_index = 0; 1532e93651e5SJeremy L Thompson 1533e93651e5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false; 1534e93651e5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 1535e93651e5SJeremy L Thompson CeedVector vec_i; 1536e93651e5SJeremy L Thompson CeedElemRestriction rstr_i; 1537e93651e5SJeremy L Thompson 1538e93651e5SJeremy L Thompson if (is_ordered[i]) continue; 1539e93651e5SJeremy L Thompson field_rstr_in_buffer[i] = i; 1540e93651e5SJeremy L Thompson is_ordered[i] = true; 1541e93651e5SJeremy L Thompson input_field_order[curr_index] = i; 1542e93651e5SJeremy L Thompson curr_index++; 1543034f99fdSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i)); 1544e93651e5SJeremy L Thompson if (vec_i == CEED_VECTOR_NONE) continue; // CEED_EVAL_WEIGHT 1545e93651e5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i)); 1546e93651e5SJeremy L Thompson for (CeedInt j = i + 1; j < num_input_fields; j++) { 1547e93651e5SJeremy L Thompson CeedVector vec_j; 1548e93651e5SJeremy L Thompson CeedElemRestriction rstr_j; 1549e93651e5SJeremy L Thompson 1550e93651e5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j)); 1551e93651e5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j)); 1552e93651e5SJeremy L Thompson if (rstr_i == rstr_j && vec_i == vec_j) { 1553e93651e5SJeremy L Thompson field_rstr_in_buffer[j] = i; 1554e93651e5SJeremy L Thompson is_ordered[j] = true; 1555e93651e5SJeremy L Thompson input_field_order[curr_index] = j; 1556e93651e5SJeremy L Thompson curr_index++; 1557e93651e5SJeremy L Thompson } 1558681d0ea7SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec_j)); 1559681d0ea7SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); 1560e93651e5SJeremy L Thompson } 1561681d0ea7SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec_i)); 1562681d0ea7SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); 1563e93651e5SJeremy L Thompson } 1564e93651e5SJeremy L Thompson } 1565e93651e5SJeremy L Thompson 1566e93651e5SJeremy L Thompson // -- Input restriction and basis 15670183ed61SJeremy L Thompson code << "\n" << tab << "// -- Input field restrictions and basis actions\n"; 1568e93651e5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 156959fa3f92SJeremy L Thompson const char *field_name; 157059fa3f92SJeremy L Thompson const CeedInt f = input_field_order[i]; 1571e93651e5SJeremy L Thompson 157259fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name)); 15730183ed61SJeremy L Thompson code << tab << "// ---- Input field " << f << ": " << field_name << "\n"; 1574920dcdc4Sjeremylt 15754b3e95d5SJeremy L Thompson // ---- Restriction 15760183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], 15770183ed61SJeremy L Thompson max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices)); 15789a2291e3SJeremy L Thompson 15794b3e95d5SJeremy L Thompson // ---- Basis action 15800183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true, 15810183ed61SJeremy L Thompson is_all_tensor, is_at_points, use_3d_slices)); 1582920dcdc4Sjeremylt } 1583920dcdc4Sjeremylt 15844b3e95d5SJeremy L Thompson // -- Q function 15850183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields, 15860183ed61SJeremy L Thompson qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name, 1587*745f16d1SZach Atkins Q_1d, is_all_tensor, is_at_points, use_3d_slices, false)); 1588ca735530SJeremy L Thompson 15894b3e95d5SJeremy L Thompson // -- Output basis and restriction 15900183ed61SJeremy L Thompson code << "\n" << tab << "// -- Output field basis action and restrictions\n"; 15919e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 159259fa3f92SJeremy L Thompson const char *field_name; 159359fa3f92SJeremy L Thompson 159459fa3f92SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 15950183ed61SJeremy L Thompson code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 1596ca735530SJeremy L Thompson 15974b3e95d5SJeremy L Thompson // ---- Basis action 15980183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false, 1599412e5683SJeremy L Thompson is_all_tensor, is_at_points, use_3d_slices)); 16009a2291e3SJeremy L Thompson 16014b3e95d5SJeremy L Thompson // ---- Restriction 16020183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, tab, i, NULL, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, 16030183ed61SJeremy L Thompson false, is_all_tensor, is_at_points, use_3d_slices)); 1604ac421f39SYohann } 1605241a4b83SYohann 16064b3e95d5SJeremy L Thompson // Close loop and function 16070183ed61SJeremy L Thompson tab.pop(); 16080183ed61SJeremy L Thompson code << tab << "}\n"; 16090183ed61SJeremy L Thompson tab.pop(); 16100183ed61SJeremy L Thompson code << tab << "}\n"; 16110183ed61SJeremy L Thompson code << tab << "// -----------------------------------------------------------------------------\n\n"; 1612241a4b83SYohann 16133a2968d6SJeremy L Thompson // Compile 1614ddae5012SJeremy L Thompson { 1615ddae5012SJeremy L Thompson bool is_compile_good = false; 1616412e5683SJeremy L Thompson const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d); 1617ddae5012SJeremy L Thompson 1618a61b1c91SJeremy L Thompson data->thread_1d = T_1d; 1619412e5683SJeremy L Thompson CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "OP_T_1D", T_1d)); 1620ddae5012SJeremy L Thompson if (is_compile_good) { 1621ddae5012SJeremy L Thompson *is_good_build = true; 1622eb7e6cafSJeremy L Thompson CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op)); 1623ddae5012SJeremy L Thompson } else { 1624ddae5012SJeremy L Thompson *is_good_build = false; 1625ddae5012SJeremy L Thompson data->use_fallback = true; 1626ddae5012SJeremy L Thompson } 1627ddae5012SJeremy L Thompson } 16282b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorSetSetupDone(op)); 16299bc66399SJeremy L Thompson CeedCallBackend(CeedDestroy(&ceed)); 1630c11e12f4SJeremy L Thompson CeedCallBackend(CeedQFunctionDestroy(&qf)); 1631e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 1632241a4b83SYohann } 16332a86cc9dSSebastian Grimberg 1634ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 16350183ed61SJeremy L Thompson // Build AtPoints assembly operator kernel 16360183ed61SJeremy L Thompson //------------------------------------------------------------------------------ 16370183ed61SJeremy L Thompson static int CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(CeedOperator op, bool is_full, bool *is_good_build) { 16380183ed61SJeremy L Thompson bool is_all_tensor = true, is_at_points = false, use_3d_slices = false; 16390183ed61SJeremy L Thompson Ceed ceed; 16400183ed61SJeremy L Thompson CeedInt Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0; 16410183ed61SJeremy L Thompson CeedQFunctionField *qf_input_fields, *qf_output_fields; 16420183ed61SJeremy L Thompson CeedQFunction_Cuda_gen *qf_data; 16430183ed61SJeremy L Thompson CeedQFunction qf; 16440183ed61SJeremy L Thompson CeedOperatorField *op_input_fields, *op_output_fields; 16450183ed61SJeremy L Thompson CeedOperator_Cuda_gen *data; 16460183ed61SJeremy L Thompson std::ostringstream code; 16470183ed61SJeremy L Thompson Tab tab; 16480183ed61SJeremy L Thompson 16490183ed61SJeremy L Thompson // Check compatibility 16500183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 16510183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points)); 16520183ed61SJeremy L Thompson CeedCheck(is_at_points, ceed, CEED_ERROR_BACKEND, "Only AtPoints operator assembly supported"); 16530183ed61SJeremy L Thompson 16540183ed61SJeremy L Thompson // Retrieve operator data 16550183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorGetData(op, &data)); 16560183ed61SJeremy L Thompson Q = data->Q; 16570183ed61SJeremy L Thompson Q_1d = data->Q_1d; 16580183ed61SJeremy L Thompson max_dim = data->dim; 16590183ed61SJeremy L Thompson { 16600183ed61SJeremy L Thompson CeedElemRestriction rstr_points = NULL; 16610183ed61SJeremy L Thompson 16620183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); 16630183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points)); 16640183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride)); 16650183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); 16660183ed61SJeremy L Thompson } 16670183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 16680183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); 16690183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 16700183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 16710183ed61SJeremy L Thompson 16720183ed61SJeremy L Thompson // Add atomicAdd function for old NVidia architectures 16730183ed61SJeremy L Thompson { 16740183ed61SJeremy L Thompson Ceed_Cuda *ceed_data; 16750183ed61SJeremy L Thompson struct cudaDeviceProp prop; 16760183ed61SJeremy L Thompson 16770183ed61SJeremy L Thompson CeedCallBackend(CeedGetData(ceed, &ceed_data)); 16780183ed61SJeremy L Thompson CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id)); 16790183ed61SJeremy L Thompson if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) { 16800183ed61SJeremy L Thompson code << tab << "// AtomicAdd fallback source\n"; 16810183ed61SJeremy L Thompson code << tab << "#include <ceed/jit-source/cuda/cuda-atomic-add-fallback.h>\n\n"; 16820183ed61SJeremy L Thompson } 16830183ed61SJeremy L Thompson } 16840183ed61SJeremy L Thompson 16850183ed61SJeremy L Thompson // Load basis source files 16860183ed61SJeremy L Thompson code << tab << "// Tensor basis source\n"; 16870183ed61SJeremy L Thompson code << tab << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n"; 16880183ed61SJeremy L Thompson code << tab << "// AtPoints basis source\n"; 16890183ed61SJeremy L Thompson code << tab << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h>\n\n"; 16900183ed61SJeremy L Thompson code << tab << "// CodeGen operator source\n"; 16910183ed61SJeremy L Thompson code << tab << "#include <ceed/jit-source/cuda/cuda-gen-templates.h>\n\n"; 16920183ed61SJeremy L Thompson 16930183ed61SJeremy L Thompson // Get QFunction name 16940183ed61SJeremy L Thompson std::string qfunction_name(qf_data->qfunction_name); 16950183ed61SJeremy L Thompson std::string operator_name; 16960183ed61SJeremy L Thompson 16970183ed61SJeremy L Thompson if (is_full) { 16980183ed61SJeremy L Thompson operator_name = "CeedKernelCudaGenOperatorFullAssembly_" + qfunction_name; 16990183ed61SJeremy L Thompson } else { 17000183ed61SJeremy L Thompson operator_name = "CeedKernelCudaGenOperatorDiagonalAssembly_" + qfunction_name; 17010183ed61SJeremy L Thompson } 17020183ed61SJeremy L Thompson 17030183ed61SJeremy L Thompson // Define CEED_Q_VLA 17040183ed61SJeremy L Thompson code << "\n" << tab << "#undef CEED_Q_VLA\n"; 17050183ed61SJeremy L Thompson code << tab << "#define CEED_Q_VLA 1\n\n"; 17060183ed61SJeremy L Thompson 17070183ed61SJeremy L Thompson // Add user QFunction source 17080183ed61SJeremy L Thompson { 17090183ed61SJeremy L Thompson const char *source_path; 17100183ed61SJeremy L Thompson 17110183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path)); 17120183ed61SJeremy L Thompson CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/cuda/gen backend requires QFunction source code file"); 17130183ed61SJeremy L Thompson 17140183ed61SJeremy L Thompson code << tab << "// User QFunction source\n"; 17150183ed61SJeremy L Thompson code << tab << "#include \"" << source_path << "\"\n\n"; 17160183ed61SJeremy L Thompson } 17170183ed61SJeremy L Thompson 17180183ed61SJeremy L Thompson // Setup 17190183ed61SJeremy L Thompson code << "\n" << tab << "// -----------------------------------------------------------------------------\n"; 17200183ed61SJeremy L Thompson code << tab << "// Operator Assembly Kernel\n"; 17210183ed61SJeremy L Thompson code << tab << "// \n"; 17220183ed61SJeremy L Thompson code << tab << "// d_[in,out]_i: CeedVector device array\n"; 17230183ed61SJeremy L Thompson code << tab << "// r_[in,out]_e_i: Element vector register\n"; 17240183ed61SJeremy L Thompson code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n"; 17250183ed61SJeremy L Thompson code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n"; 17260183ed61SJeremy L Thompson code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n"; 17270183ed61SJeremy L Thompson code << tab << "// \n"; 17280183ed61SJeremy L Thompson code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n"; 17290183ed61SJeremy L Thompson code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; 17300183ed61SJeremy L Thompson code << tab << "// -----------------------------------------------------------------------------\n"; 17310183ed61SJeremy L Thompson code << tab << "extern \"C\" __global__ void " << operator_name 17320183ed61SJeremy L Thompson << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda " 17330183ed61SJeremy L Thompson "points, CeedScalar *__restrict__ values_array) {\n"; 17340183ed61SJeremy L Thompson tab.push(); 17350183ed61SJeremy L Thompson 17360183ed61SJeremy L Thompson // Scratch buffers 17370183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 17380183ed61SJeremy L Thompson CeedEvalMode eval_mode; 17390183ed61SJeremy L Thompson 17400183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 17410183ed61SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 17420183ed61SJeremy L Thompson code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; 17430183ed61SJeremy L Thompson } 17440183ed61SJeremy L Thompson } 17450183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 17460183ed61SJeremy L Thompson code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; 17470183ed61SJeremy L Thompson } 17480183ed61SJeremy L Thompson 17490183ed61SJeremy L Thompson code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; 17500183ed61SJeremy L Thompson code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n"; 17510183ed61SJeremy L Thompson code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n"; 17520183ed61SJeremy L Thompson code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n"; 17530183ed61SJeremy L Thompson 17540183ed61SJeremy L Thompson // Shared data 17550183ed61SJeremy L Thompson code << tab << "extern __shared__ CeedScalar slice[];\n"; 17560183ed61SJeremy L Thompson code << tab << "SharedData_Cuda data;\n"; 17570183ed61SJeremy L Thompson code << tab << "data.t_id_x = threadIdx.x;\n"; 17580183ed61SJeremy L Thompson code << tab << "data.t_id_y = threadIdx.y;\n"; 17590183ed61SJeremy L Thompson code << tab << "data.t_id_z = threadIdx.z;\n"; 17600183ed61SJeremy L Thompson code << tab << "data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 17610183ed61SJeremy L Thompson code << tab << "data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n"; 17620183ed61SJeremy L Thompson 17630183ed61SJeremy L Thompson // -- Determine input mat reuse 17640183ed61SJeremy L Thompson FieldReuse_Cuda input_matrix_reuse[CEED_FIELD_MAX]; 17650183ed61SJeremy L Thompson 17660183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 17670183ed61SJeremy L Thompson input_matrix_reuse[i].index = -1; 17680183ed61SJeremy L Thompson } 17690183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 17700183ed61SJeremy L Thompson CeedEvalMode eval_mode_i; 17710183ed61SJeremy L Thompson CeedBasis basis_i; 17720183ed61SJeremy L Thompson 17730183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i)); 17740183ed61SJeremy L Thompson if (eval_mode_i == CEED_EVAL_WEIGHT) continue; 17750183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i)); 17760183ed61SJeremy L Thompson for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) { 17770183ed61SJeremy L Thompson CeedEvalMode eval_mode_j; 17780183ed61SJeremy L Thompson CeedBasis basis_j; 17790183ed61SJeremy L Thompson 17800183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); 17810183ed61SJeremy L Thompson if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 17820183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); 17830183ed61SJeremy L Thompson if (basis_i == basis_j) { 17840183ed61SJeremy L Thompson input_matrix_reuse[i].index = j; 17850183ed61SJeremy L Thompson input_matrix_reuse[i].is_input = true; 17860183ed61SJeremy L Thompson input_matrix_reuse[i].eval_mode = eval_mode_j; 17870183ed61SJeremy L Thompson } 17880183ed61SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_j)); 17890183ed61SJeremy L Thompson } 17900183ed61SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_i)); 17910183ed61SJeremy L Thompson } 17920183ed61SJeremy L Thompson 17930183ed61SJeremy L Thompson // -- Determine output mat reuse 17940183ed61SJeremy L Thompson FieldReuse_Cuda output_matrix_reuse[CEED_FIELD_MAX]; 17950183ed61SJeremy L Thompson 17960183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 17970183ed61SJeremy L Thompson output_matrix_reuse[i].index = -1; 17980183ed61SJeremy L Thompson } 17990183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 18000183ed61SJeremy L Thompson CeedEvalMode eval_mode_i; 18010183ed61SJeremy L Thompson CeedBasis basis_i; 18020183ed61SJeremy L Thompson 18030183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i)); 18040183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i)); 18050183ed61SJeremy L Thompson for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) { 18060183ed61SJeremy L Thompson CeedEvalMode eval_mode_j; 18070183ed61SJeremy L Thompson CeedBasis basis_j; 18080183ed61SJeremy L Thompson 18090183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); 18100183ed61SJeremy L Thompson if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 18110183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); 18120183ed61SJeremy L Thompson if (basis_i == basis_j) { 18130183ed61SJeremy L Thompson output_matrix_reuse[i].index = j; 18140183ed61SJeremy L Thompson output_matrix_reuse[i].is_input = true; 18150183ed61SJeremy L Thompson output_matrix_reuse[i].eval_mode = eval_mode_j; 18160183ed61SJeremy L Thompson } 18170183ed61SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_j)); 18180183ed61SJeremy L Thompson } 18190183ed61SJeremy L Thompson for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) { 18200183ed61SJeremy L Thompson CeedEvalMode eval_mode_j; 18210183ed61SJeremy L Thompson CeedBasis basis_j; 18220183ed61SJeremy L Thompson 18230183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j)); 18240183ed61SJeremy L Thompson if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 18250183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j)); 18260183ed61SJeremy L Thompson if (basis_i == basis_j) { 18270183ed61SJeremy L Thompson output_matrix_reuse[i].index = j; 18280183ed61SJeremy L Thompson output_matrix_reuse[i].is_input = false; 18290183ed61SJeremy L Thompson output_matrix_reuse[i].eval_mode = eval_mode_j; 18300183ed61SJeremy L Thompson } 18310183ed61SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_j)); 18320183ed61SJeremy L Thompson } 18330183ed61SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_i)); 18340183ed61SJeremy L Thompson } 18350183ed61SJeremy L Thompson 18360183ed61SJeremy L Thompson // Initialize constants, and matrices B and G 18370183ed61SJeremy L Thompson code << "\n" << tab << "// Input field constants and basis data\n"; 18380183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 18390183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], 1840ca1da9b9SJeremy L Thompson max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false)); 18410183ed61SJeremy L Thompson } 18420183ed61SJeremy L Thompson code << "\n" << tab << "// Output field constants and basis data\n"; 18430183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 18440183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], 1845ca1da9b9SJeremy L Thompson max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false)); 18460183ed61SJeremy L Thompson } 18470183ed61SJeremy L Thompson 18480183ed61SJeremy L Thompson // Loop over all elements 18490183ed61SJeremy L Thompson code << "\n" << tab << "// Element loop\n"; 18500183ed61SJeremy L Thompson code << tab << "__syncthreads();\n"; 18510183ed61SJeremy L Thompson code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n"; 18520183ed61SJeremy L Thompson tab.push(); 18530183ed61SJeremy L Thompson 18540183ed61SJeremy L Thompson // -- Compute minimum buffer space needed 18550183ed61SJeremy L Thompson CeedInt max_rstr_buffer_size = 1; 18560183ed61SJeremy L Thompson 18570183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 18580183ed61SJeremy L Thompson CeedEvalMode eval_mode; 18590183ed61SJeremy L Thompson 18600183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 18610183ed61SJeremy L Thompson if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) { 18620183ed61SJeremy L Thompson CeedInt num_comp; 18630183ed61SJeremy L Thompson CeedElemRestriction elem_rstr; 18640183ed61SJeremy L Thompson 18650183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 18660183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 18670183ed61SJeremy L Thompson max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); 18680183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 18690183ed61SJeremy L Thompson } 18700183ed61SJeremy L Thompson } 18710183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 18720183ed61SJeremy L Thompson CeedEvalMode eval_mode; 18730183ed61SJeremy L Thompson 18740183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 18750183ed61SJeremy L Thompson if (eval_mode != CEED_EVAL_NONE) { 18760183ed61SJeremy L Thompson CeedInt num_comp; 18770183ed61SJeremy L Thompson CeedElemRestriction elem_rstr; 18780183ed61SJeremy L Thompson 18790183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 18800183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 18810183ed61SJeremy L Thompson max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); 18820183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 18830183ed61SJeremy L Thompson } 18840183ed61SJeremy L Thompson } 18850183ed61SJeremy L Thompson code << tab << "// Scratch restriction buffer space\n"; 18860183ed61SJeremy L Thompson code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n"; 18870183ed61SJeremy L Thompson 18880183ed61SJeremy L Thompson // -- Determine best input field processing order 18890183ed61SJeremy L Thompson CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX]; 18900183ed61SJeremy L Thompson 18910183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 18920183ed61SJeremy L Thompson field_rstr_in_buffer[i] = -1; 18930183ed61SJeremy L Thompson input_field_order[i] = -1; 18940183ed61SJeremy L Thompson } 18950183ed61SJeremy L Thompson { 18960183ed61SJeremy L Thompson bool is_ordered[CEED_FIELD_MAX]; 18970183ed61SJeremy L Thompson CeedInt curr_index = 0; 18980183ed61SJeremy L Thompson 18990183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false; 19000183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 19010183ed61SJeremy L Thompson CeedVector vec_i; 19020183ed61SJeremy L Thompson CeedElemRestriction rstr_i; 19030183ed61SJeremy L Thompson 19040183ed61SJeremy L Thompson if (is_ordered[i]) continue; 19050183ed61SJeremy L Thompson field_rstr_in_buffer[i] = i; 19060183ed61SJeremy L Thompson is_ordered[i] = true; 19070183ed61SJeremy L Thompson input_field_order[curr_index] = i; 19080183ed61SJeremy L Thompson curr_index++; 19090183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i)); 19100183ed61SJeremy L Thompson if (vec_i == CEED_VECTOR_NONE) continue; // CEED_EVAL_WEIGHT 19110183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i)); 19120183ed61SJeremy L Thompson for (CeedInt j = i + 1; j < num_input_fields; j++) { 19130183ed61SJeremy L Thompson CeedVector vec_j; 19140183ed61SJeremy L Thompson CeedElemRestriction rstr_j; 19150183ed61SJeremy L Thompson 19160183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j)); 19170183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j)); 19180183ed61SJeremy L Thompson if (rstr_i == rstr_j && vec_i == vec_j) { 19190183ed61SJeremy L Thompson field_rstr_in_buffer[j] = i; 19200183ed61SJeremy L Thompson is_ordered[j] = true; 19210183ed61SJeremy L Thompson input_field_order[curr_index] = j; 19220183ed61SJeremy L Thompson curr_index++; 19230183ed61SJeremy L Thompson } 19240183ed61SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec_j)); 19250183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); 19260183ed61SJeremy L Thompson } 19270183ed61SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec_i)); 19280183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); 19290183ed61SJeremy L Thompson } 19300183ed61SJeremy L Thompson } 19310183ed61SJeremy L Thompson 19320183ed61SJeremy L Thompson // -- Input restriction and basis 19330183ed61SJeremy L Thompson code << "\n" << tab << "// -- Input field restrictions and basis actions\n"; 19340183ed61SJeremy L Thompson CeedInt active_field_index = -1; 19350183ed61SJeremy L Thompson 19360183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 19370183ed61SJeremy L Thompson bool is_active = false; 19380183ed61SJeremy L Thompson const char *field_name; 19390183ed61SJeremy L Thompson const CeedInt f = input_field_order[i]; 19400183ed61SJeremy L Thompson 19410183ed61SJeremy L Thompson { 19420183ed61SJeremy L Thompson CeedVector vec; 19430183ed61SJeremy L Thompson 19440183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec)); 19450183ed61SJeremy L Thompson is_active = vec == CEED_VECTOR_ACTIVE; 19460183ed61SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec)); 19470183ed61SJeremy L Thompson } 19480183ed61SJeremy L Thompson 19490183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name)); 19500183ed61SJeremy L Thompson code << tab << "// ---- Input field " << f << ": " << field_name << "\n"; 19510183ed61SJeremy L Thompson 19520183ed61SJeremy L Thompson if (is_active) { 19530183ed61SJeremy L Thompson std::string var_suffix = "_in_" + std::to_string(f); 19540183ed61SJeremy L Thompson 19550183ed61SJeremy L Thompson code << tab << "// Active field - no restriction or basis action here\n"; 19560183ed61SJeremy L Thompson if (active_field_index == -1) { 19570183ed61SJeremy L Thompson active_field_index = f; 19580183ed61SJeremy L Thompson code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? "P_1d" + var_suffix : "1") 19590183ed61SJeremy L Thompson << "] = {0.0};\n"; 19600183ed61SJeremy L Thompson } else { 19610183ed61SJeremy L Thompson code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_in_" << active_field_index << ";\n"; 19620183ed61SJeremy L Thompson } 19630183ed61SJeremy L Thompson } else { 19640183ed61SJeremy L Thompson // ---- Restriction 19650183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], 19660183ed61SJeremy L Thompson max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices)); 19670183ed61SJeremy L Thompson 19680183ed61SJeremy L Thompson // ---- Basis action 19690183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true, 19700183ed61SJeremy L Thompson is_all_tensor, is_at_points, use_3d_slices)); 19710183ed61SJeremy L Thompson } 19720183ed61SJeremy L Thompson } 19730183ed61SJeremy L Thompson 19740183ed61SJeremy L Thompson // -- Loop over active field 19750183ed61SJeremy L Thompson std::string active_var_suffix = "_in_" + std::to_string(active_field_index); 19760183ed61SJeremy L Thompson 19770183ed61SJeremy L Thompson code << "\n" << tab << "// Loop over nodes in active field\n"; 19780183ed61SJeremy L Thompson code << tab << "for (CeedInt n = 0; n < num_comp" << active_var_suffix << "*P_1d" << active_var_suffix 19790183ed61SJeremy L Thompson << (max_dim > 1 ? "*P_1d" + active_var_suffix : "") << (max_dim > 2 ? "*P_1d" + active_var_suffix : "") << "; n++) {\n"; 19800183ed61SJeremy L Thompson tab.push(); 19810183ed61SJeremy L Thompson 19820183ed61SJeremy L Thompson // -- Set current active node and component to 1 19830183ed61SJeremy L Thompson code << tab << "// Set current active node and component to 1.0\n"; 19840183ed61SJeremy L Thompson code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 1.0, r_e" 19850183ed61SJeremy L Thompson << active_var_suffix << ");\n\n"; 19860183ed61SJeremy L Thompson 19870183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 19880183ed61SJeremy L Thompson bool is_active = false; 19890183ed61SJeremy L Thompson const char *field_name; 19900183ed61SJeremy L Thompson const CeedInt f = input_field_order[i]; 19910183ed61SJeremy L Thompson 19920183ed61SJeremy L Thompson { 19930183ed61SJeremy L Thompson CeedVector vec; 19940183ed61SJeremy L Thompson 19950183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec)); 19960183ed61SJeremy L Thompson is_active = vec == CEED_VECTOR_ACTIVE; 19970183ed61SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec)); 19980183ed61SJeremy L Thompson } 19990183ed61SJeremy L Thompson if (!is_active) continue; 20000183ed61SJeremy L Thompson 20010183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name)); 20020183ed61SJeremy L Thompson code << tab << "// ---- Input field " << f << ": " << field_name << "\n"; 20030183ed61SJeremy L Thompson 20040183ed61SJeremy L Thompson // ---- Basis action 20050183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true, 20060183ed61SJeremy L Thompson is_all_tensor, is_at_points, use_3d_slices)); 20070183ed61SJeremy L Thompson } 20080183ed61SJeremy L Thompson 20090183ed61SJeremy L Thompson // -- Q function 20100183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields, 20110183ed61SJeremy L Thompson qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name, 2012*745f16d1SZach Atkins Q_1d, is_all_tensor, is_at_points, use_3d_slices, true)); 20130183ed61SJeremy L Thompson 20140183ed61SJeremy L Thompson // -- Output basis and restriction 20150183ed61SJeremy L Thompson code << "\n" << tab << "// -- Output field basis action and restrictions\n"; 20160183ed61SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 20170183ed61SJeremy L Thompson bool is_active = false; 20180183ed61SJeremy L Thompson const char *field_name; 20190183ed61SJeremy L Thompson 20200183ed61SJeremy L Thompson { 20210183ed61SJeremy L Thompson CeedVector vec; 20220183ed61SJeremy L Thompson 20230183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); 20240183ed61SJeremy L Thompson is_active = vec == CEED_VECTOR_ACTIVE; 20250183ed61SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec)); 20260183ed61SJeremy L Thompson } 20270183ed61SJeremy L Thompson if (!is_active) continue; 20280183ed61SJeremy L Thompson 20290183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 20300183ed61SJeremy L Thompson code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 20310183ed61SJeremy L Thompson 20320183ed61SJeremy L Thompson // ---- Basis action 20330183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false, 20340183ed61SJeremy L Thompson is_all_tensor, is_at_points, use_3d_slices)); 20350183ed61SJeremy L Thompson 20360183ed61SJeremy L Thompson // ---- Restriction 20370183ed61SJeremy L Thompson if (is_full) { 2038915834c9SZach Atkins std::string var_suffix = "_out_" + std::to_string(i); 2039915834c9SZach Atkins CeedInt comp_stride; 2040915834c9SZach Atkins CeedSize l_size; 2041915834c9SZach Atkins CeedElemRestriction elem_rstr; 2042915834c9SZach Atkins 2043915834c9SZach Atkins CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 2044915834c9SZach Atkins CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 2045915834c9SZach Atkins code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; 2046915834c9SZach Atkins CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 2047915834c9SZach Atkins code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 2048915834c9SZach Atkins code << tab << "WriteLVecStandard" << max_dim << "d_Assembly<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix 2049915834c9SZach Atkins << ">(data, l_size" << var_suffix << ", elem, n, r_e" << var_suffix << ", values_array);\n"; 2050915834c9SZach Atkins CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 20510183ed61SJeremy L Thompson } else { 20520183ed61SJeremy L Thompson std::string var_suffix = "_out_" + std::to_string(i); 20530183ed61SJeremy L Thompson CeedInt comp_stride; 20540183ed61SJeremy L Thompson CeedSize l_size; 20550183ed61SJeremy L Thompson CeedElemRestriction elem_rstr; 20560183ed61SJeremy L Thompson 20570183ed61SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 20580183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 20590183ed61SJeremy L Thompson code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; 20600183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 20610183ed61SJeremy L Thompson code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 20620183ed61SJeremy L Thompson code << tab << "WriteLVecStandard" << max_dim << "d_Single<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix 20630183ed61SJeremy L Thompson << ">(data, l_size" << var_suffix << ", elem, n, indices.outputs[" << i << "], r_e" << var_suffix << ", values_array);\n"; 20640183ed61SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 20650183ed61SJeremy L Thompson } 20660183ed61SJeremy L Thompson } 20670183ed61SJeremy L Thompson 20680183ed61SJeremy L Thompson // -- Reset current active node and component 20690183ed61SJeremy L Thompson code << "\n" << tab << "// Reset current active node and component to 0.0\n"; 20700183ed61SJeremy L Thompson code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 0.0, r_e" 20710183ed61SJeremy L Thompson << active_var_suffix << ");\n"; 20720183ed61SJeremy L Thompson 20730183ed61SJeremy L Thompson // -- End of loop over active field 20740183ed61SJeremy L Thompson tab.pop(); 20750183ed61SJeremy L Thompson code << tab << "}\n"; 20760183ed61SJeremy L Thompson 20770183ed61SJeremy L Thompson // Close loop and function 20780183ed61SJeremy L Thompson tab.pop(); 20790183ed61SJeremy L Thompson code << tab << "}\n"; 20800183ed61SJeremy L Thompson tab.pop(); 20810183ed61SJeremy L Thompson code << tab << "}\n"; 20820183ed61SJeremy L Thompson code << tab << "// -----------------------------------------------------------------------------\n\n"; 20830183ed61SJeremy L Thompson 20840183ed61SJeremy L Thompson // Compile 20850183ed61SJeremy L Thompson { 20860183ed61SJeremy L Thompson bool is_compile_good = false; 20870183ed61SJeremy L Thompson const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d); 20880183ed61SJeremy L Thompson 20890183ed61SJeremy L Thompson data->thread_1d = T_1d; 20900183ed61SJeremy L Thompson CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, 20910183ed61SJeremy L Thompson is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 1, "OP_T_1D", T_1d)); 20920183ed61SJeremy L Thompson if (is_compile_good) { 20930183ed61SJeremy L Thompson *is_good_build = true; 20940183ed61SJeremy L Thompson CeedCallBackend(CeedGetKernel_Cuda(ceed, is_full ? data->module_assemble_full : data->module_assemble_diagonal, operator_name.c_str(), 20950183ed61SJeremy L Thompson is_full ? &data->assemble_full : &data->assemble_diagonal)); 20960183ed61SJeremy L Thompson } else { 20970183ed61SJeremy L Thompson *is_good_build = false; 20980183ed61SJeremy L Thompson data->use_assembly_fallback = true; 20990183ed61SJeremy L Thompson } 21000183ed61SJeremy L Thompson } 21010183ed61SJeremy L Thompson CeedCallBackend(CeedDestroy(&ceed)); 21020183ed61SJeremy L Thompson CeedCallBackend(CeedQFunctionDestroy(&qf)); 21030183ed61SJeremy L Thompson return CEED_ERROR_SUCCESS; 21040183ed61SJeremy L Thompson } 21050183ed61SJeremy L Thompson 21060183ed61SJeremy L Thompson extern "C" int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Cuda_gen(CeedOperator op, bool *is_good_build) { 21070183ed61SJeremy L Thompson return CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(op, false, is_good_build); 21080183ed61SJeremy L Thompson } 21090183ed61SJeremy L Thompson 2110915834c9SZach Atkins extern "C" int CeedOperatorBuildKernelFullAssemblyAtPoints_Cuda_gen(CeedOperator op, bool *is_good_build) { 2111915834c9SZach Atkins return CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen(op, true, is_good_build); 2112915834c9SZach Atkins } 2113915834c9SZach Atkins 21140183ed61SJeremy L Thompson //------------------------------------------------------------------------------ 21150816752eSJeremy L Thompson // Build QFunction assembly operator kernel 21160816752eSJeremy L Thompson //------------------------------------------------------------------------------ 21170816752eSJeremy L Thompson extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen(CeedOperator op, bool *is_good_build) { 21180816752eSJeremy L Thompson bool is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false; 21190816752eSJeremy L Thompson Ceed ceed; 21200816752eSJeremy L Thompson CeedInt Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0; 21210816752eSJeremy L Thompson CeedQFunctionField *qf_input_fields, *qf_output_fields; 21220816752eSJeremy L Thompson CeedQFunction_Cuda_gen *qf_data; 21230816752eSJeremy L Thompson CeedQFunction qf; 21240816752eSJeremy L Thompson CeedOperatorField *op_input_fields, *op_output_fields; 21250816752eSJeremy L Thompson CeedOperator_Cuda_gen *data; 21260816752eSJeremy L Thompson std::ostringstream code; 21270816752eSJeremy L Thompson Tab tab; 21280816752eSJeremy L Thompson 21290816752eSJeremy L Thompson // Check compatibility 21300816752eSJeremy L Thompson CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 21310816752eSJeremy L Thompson CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points)); 21320816752eSJeremy L Thompson CeedCheck(!is_at_points, ceed, CEED_ERROR_BACKEND, "AtPoints QFunction assembly is not supported"); 21330816752eSJeremy L Thompson 21340816752eSJeremy L Thompson // Check field compatibility 21350816752eSJeremy L Thompson CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 21360816752eSJeremy L Thompson { 21370816752eSJeremy L Thompson bool has_shared_bases = true; 21380816752eSJeremy L Thompson 21390816752eSJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 21400816752eSJeremy L Thompson CeedBasis basis; 21410816752eSJeremy L Thompson 21420816752eSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 21430816752eSJeremy L Thompson if (basis != CEED_BASIS_NONE) { 21440816752eSJeremy L Thompson bool is_tensor = true; 21450816752eSJeremy L Thompson const char *resource; 21460816752eSJeremy L Thompson char *resource_root; 21470816752eSJeremy L Thompson Ceed basis_ceed; 21480816752eSJeremy L Thompson 21490816752eSJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 21500816752eSJeremy L Thompson is_all_tensor = is_all_tensor && is_tensor; 21510816752eSJeremy L Thompson is_all_nontensor = is_all_nontensor && !is_tensor; 21520816752eSJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); 21530816752eSJeremy L Thompson CeedCallBackend(CeedGetResource(basis_ceed, &resource)); 21540816752eSJeremy L Thompson CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); 21550816752eSJeremy L Thompson has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared"); 21560816752eSJeremy L Thompson CeedCallBackend(CeedFree(&resource_root)); 21570816752eSJeremy L Thompson CeedCallBackend(CeedDestroy(&basis_ceed)); 21580816752eSJeremy L Thompson } 21590816752eSJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 21600816752eSJeremy L Thompson } 21610816752eSJeremy L Thompson 21620816752eSJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 21630816752eSJeremy L Thompson CeedBasis basis; 21640816752eSJeremy L Thompson 21650816752eSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 21660816752eSJeremy L Thompson if (basis != CEED_BASIS_NONE) { 21670816752eSJeremy L Thompson bool is_tensor = true; 21680816752eSJeremy L Thompson const char *resource; 21690816752eSJeremy L Thompson char *resource_root; 21700816752eSJeremy L Thompson Ceed basis_ceed; 21710816752eSJeremy L Thompson 21720816752eSJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); 21730816752eSJeremy L Thompson is_all_tensor = is_all_tensor && is_tensor; 21740816752eSJeremy L Thompson is_all_nontensor = is_all_nontensor && !is_tensor; 21750816752eSJeremy L Thompson 21760816752eSJeremy L Thompson CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); 21770816752eSJeremy L Thompson CeedCallBackend(CeedGetResource(basis_ceed, &resource)); 21780816752eSJeremy L Thompson CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); 21790816752eSJeremy L Thompson has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared"); 21800816752eSJeremy L Thompson CeedCallBackend(CeedFree(&resource_root)); 21810816752eSJeremy L Thompson CeedCallBackend(CeedDestroy(&basis_ceed)); 21820816752eSJeremy L Thompson } 21830816752eSJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 21840816752eSJeremy L Thompson } 21850816752eSJeremy L Thompson } 21860816752eSJeremy L Thompson 21870816752eSJeremy L Thompson // Retrieve operator data 21880816752eSJeremy L Thompson CeedCallBackend(CeedOperatorGetData(op, &data)); 21890816752eSJeremy L Thompson Q = data->Q; 21900816752eSJeremy L Thompson Q_1d = data->Q_1d; 21910816752eSJeremy L Thompson max_dim = data->dim; 21920816752eSJeremy L Thompson CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 21930816752eSJeremy L Thompson CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); 21940816752eSJeremy L Thompson CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 21950816752eSJeremy L Thompson 21960816752eSJeremy L Thompson // Add atomicAdd function for old NVidia architectures 21970816752eSJeremy L Thompson { 21980816752eSJeremy L Thompson Ceed_Cuda *ceed_data; 21990816752eSJeremy L Thompson struct cudaDeviceProp prop; 22000816752eSJeremy L Thompson 22010816752eSJeremy L Thompson CeedCallBackend(CeedGetData(ceed, &ceed_data)); 22020816752eSJeremy L Thompson CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id)); 22030816752eSJeremy L Thompson if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) { 22040816752eSJeremy L Thompson code << tab << "// AtomicAdd fallback source\n"; 22050816752eSJeremy L Thompson code << tab << "#include <ceed/jit-source/cuda/cuda-atomic-add-fallback.h>\n\n"; 22060816752eSJeremy L Thompson } 22070816752eSJeremy L Thompson } 22080816752eSJeremy L Thompson 22090816752eSJeremy L Thompson // Load basis source files 22100816752eSJeremy L Thompson if (!is_all_nontensor) { 22110816752eSJeremy L Thompson code << tab << "// Tensor basis source\n"; 22120816752eSJeremy L Thompson code << tab << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n"; 22130816752eSJeremy L Thompson } 22140816752eSJeremy L Thompson if (!is_all_tensor) { 22150816752eSJeremy L Thompson code << tab << "// Non-tensor basis source\n"; 22160816752eSJeremy L Thompson code << tab << "#include <ceed/jit-source/cuda/cuda-shared-basis-nontensor-templates.h>\n\n"; 22170816752eSJeremy L Thompson } 22180816752eSJeremy L Thompson if (!is_all_tensor && !is_all_nontensor) { 22190816752eSJeremy L Thompson code << "// Tensor basis source\n"; 22200816752eSJeremy L Thompson code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h>\n\n"; 22210816752eSJeremy L Thompson } 22220816752eSJeremy L Thompson code << "// CodeGen operator source\n"; 22230816752eSJeremy L Thompson code << "#include <ceed/jit-source/cuda/cuda-gen-templates.h>\n\n"; 22240816752eSJeremy L Thompson 22250816752eSJeremy L Thompson // Get QFunction name 22260816752eSJeremy L Thompson std::string qfunction_name(qf_data->qfunction_name); 22270816752eSJeremy L Thompson std::string operator_name; 22280816752eSJeremy L Thompson 22290816752eSJeremy L Thompson operator_name = "CeedKernelCudaGenQFunctionAssembly_" + qfunction_name; 22300816752eSJeremy L Thompson 22310816752eSJeremy L Thompson // Define CEED_Q_VLA 22320816752eSJeremy L Thompson code << "\n" << tab << "#undef CEED_Q_VLA\n"; 22330816752eSJeremy L Thompson if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) { 22340816752eSJeremy L Thompson code << tab << "#define CEED_Q_VLA 1\n\n"; 22350816752eSJeremy L Thompson } else { 22360816752eSJeremy L Thompson code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n"; 22370816752eSJeremy L Thompson } 22380816752eSJeremy L Thompson 22390816752eSJeremy L Thompson // Add user QFunction source 22400816752eSJeremy L Thompson { 22410816752eSJeremy L Thompson const char *source_path; 22420816752eSJeremy L Thompson 22430816752eSJeremy L Thompson CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path)); 22440816752eSJeremy L Thompson CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/cuda/gen backend requires QFunction source code file"); 22450816752eSJeremy L Thompson 22460816752eSJeremy L Thompson code << tab << "// User QFunction source\n"; 22470816752eSJeremy L Thompson code << tab << "#include \"" << source_path << "\"\n\n"; 22480816752eSJeremy L Thompson } 22490816752eSJeremy L Thompson 22500816752eSJeremy L Thompson // Setup 22510816752eSJeremy L Thompson code << "\n" << tab << "// -----------------------------------------------------------------------------\n"; 22520816752eSJeremy L Thompson code << tab << "// Operator Assembly Kernel\n"; 22530816752eSJeremy L Thompson code << tab << "// \n"; 22540816752eSJeremy L Thompson code << tab << "// d_[in,out]_i: CeedVector device array\n"; 22550816752eSJeremy L Thompson code << tab << "// r_[in,out]_e_i: Element vector register\n"; 22560816752eSJeremy L Thompson code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n"; 22570816752eSJeremy L Thompson code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n"; 22580816752eSJeremy L Thompson code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n"; 22590816752eSJeremy L Thompson code << tab << "// \n"; 22600816752eSJeremy L Thompson code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n"; 22610816752eSJeremy L Thompson code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; 22620816752eSJeremy L Thompson code << tab << "// -----------------------------------------------------------------------------\n"; 22630816752eSJeremy L Thompson code << tab << "extern \"C\" __global__ void " << operator_name 22640816752eSJeremy L Thompson << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda " 22650816752eSJeremy L Thompson "points, CeedScalar *__restrict__ values_array) {\n"; 22660816752eSJeremy L Thompson tab.push(); 22670816752eSJeremy L Thompson 22680816752eSJeremy L Thompson // Scratch buffers 22690816752eSJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 22700816752eSJeremy L Thompson CeedEvalMode eval_mode; 22710816752eSJeremy L Thompson 22720816752eSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 22730816752eSJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 22740816752eSJeremy L Thompson code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n"; 22750816752eSJeremy L Thompson } 22760816752eSJeremy L Thompson } 22770816752eSJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 2278*745f16d1SZach Atkins bool is_active = false; 2279*745f16d1SZach Atkins 2280*745f16d1SZach Atkins { 2281*745f16d1SZach Atkins CeedVector vec; 2282*745f16d1SZach Atkins 2283*745f16d1SZach Atkins CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); 2284*745f16d1SZach Atkins is_active = vec == CEED_VECTOR_ACTIVE; 2285*745f16d1SZach Atkins CeedCallBackend(CeedVectorDestroy(&vec)); 2286*745f16d1SZach Atkins } 2287*745f16d1SZach Atkins if (is_active) { 22880816752eSJeremy L Thompson code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n"; 22890816752eSJeremy L Thompson } 2290*745f16d1SZach Atkins } 22910816752eSJeremy L Thompson 22920816752eSJeremy L Thompson code << tab << "const CeedInt max_dim = " << max_dim << ";\n"; 22930816752eSJeremy L Thompson if (!is_all_tensor) { 22940816752eSJeremy L Thompson code << tab << "const CeedInt Q = " << Q << ";\n"; 22950816752eSJeremy L Thompson } 22960816752eSJeremy L Thompson if (!is_all_nontensor) { 22970816752eSJeremy L Thompson code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n"; 22980816752eSJeremy L Thompson } 22990816752eSJeremy L Thompson 23000816752eSJeremy L Thompson // Shared data 23010816752eSJeremy L Thompson code << tab << "extern __shared__ CeedScalar slice[];\n"; 23020816752eSJeremy L Thompson code << tab << "SharedData_Cuda data;\n"; 23030816752eSJeremy L Thompson code << tab << "data.t_id_x = threadIdx.x;\n"; 23040816752eSJeremy L Thompson code << tab << "data.t_id_y = threadIdx.y;\n"; 23050816752eSJeremy L Thompson code << tab << "data.t_id_z = threadIdx.z;\n"; 23060816752eSJeremy L Thompson code << tab << "data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 23070816752eSJeremy L Thompson code << tab << "data.slice = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n"; 23080816752eSJeremy L Thompson 23090816752eSJeremy L Thompson // -- Determine input mat reuse 23100816752eSJeremy L Thompson FieldReuse_Cuda input_matrix_reuse[CEED_FIELD_MAX]; 23110816752eSJeremy L Thompson 23120816752eSJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 23130816752eSJeremy L Thompson input_matrix_reuse[i].index = -1; 23140816752eSJeremy L Thompson } 23150816752eSJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 23160816752eSJeremy L Thompson bool is_tensor = true; 23170816752eSJeremy L Thompson CeedEvalMode eval_mode_i; 23180816752eSJeremy L Thompson CeedBasis basis_i; 23190816752eSJeremy L Thompson 23200816752eSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i)); 23210816752eSJeremy L Thompson if (eval_mode_i == CEED_EVAL_WEIGHT) continue; 23220816752eSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i)); 23230816752eSJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor)); 23240816752eSJeremy L Thompson for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) { 23250816752eSJeremy L Thompson CeedEvalMode eval_mode_j; 23260816752eSJeremy L Thompson CeedBasis basis_j; 23270816752eSJeremy L Thompson 23280816752eSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); 23290816752eSJeremy L Thompson if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 23300816752eSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); 23310816752eSJeremy L Thompson if (basis_i == basis_j) { 23320816752eSJeremy L Thompson if (is_tensor) { 23330816752eSJeremy L Thompson input_matrix_reuse[i].index = j; 23340816752eSJeremy L Thompson input_matrix_reuse[i].is_input = true; 23350816752eSJeremy L Thompson input_matrix_reuse[i].eval_mode = eval_mode_j; 23360816752eSJeremy L Thompson } else { 23370816752eSJeremy L Thompson // For non-tensor can only re-use with the same eval mode 23380816752eSJeremy L Thompson if (eval_mode_i == eval_mode_j) { 23390816752eSJeremy L Thompson input_matrix_reuse[i].index = j; 23400816752eSJeremy L Thompson input_matrix_reuse[i].is_input = true; 23410816752eSJeremy L Thompson input_matrix_reuse[i].eval_mode = eval_mode_j; 23420816752eSJeremy L Thompson } 23430816752eSJeremy L Thompson } 23440816752eSJeremy L Thompson } 23450816752eSJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_j)); 23460816752eSJeremy L Thompson } 23470816752eSJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_i)); 23480816752eSJeremy L Thompson } 23490816752eSJeremy L Thompson 23500816752eSJeremy L Thompson // -- Determine output mat reuse 23510816752eSJeremy L Thompson FieldReuse_Cuda output_matrix_reuse[CEED_FIELD_MAX]; 23520816752eSJeremy L Thompson 23530816752eSJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 23540816752eSJeremy L Thompson output_matrix_reuse[i].index = -1; 23550816752eSJeremy L Thompson } 23560816752eSJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 23570816752eSJeremy L Thompson bool is_tensor = true; 23580816752eSJeremy L Thompson CeedEvalMode eval_mode_i; 23590816752eSJeremy L Thompson CeedBasis basis_i; 23600816752eSJeremy L Thompson 23610816752eSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i)); 23620816752eSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i)); 23630816752eSJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor)); 23640816752eSJeremy L Thompson for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) { 23650816752eSJeremy L Thompson CeedEvalMode eval_mode_j; 23660816752eSJeremy L Thompson CeedBasis basis_j; 23670816752eSJeremy L Thompson 23680816752eSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); 23690816752eSJeremy L Thompson if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 23700816752eSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); 23710816752eSJeremy L Thompson if (basis_i == basis_j) { 23720816752eSJeremy L Thompson if (is_tensor) { 23730816752eSJeremy L Thompson output_matrix_reuse[i].index = j; 23740816752eSJeremy L Thompson output_matrix_reuse[i].is_input = true; 23750816752eSJeremy L Thompson output_matrix_reuse[i].eval_mode = eval_mode_j; 23760816752eSJeremy L Thompson } else { 23770816752eSJeremy L Thompson // For non-tensor can only re-use with the same eval mode 23780816752eSJeremy L Thompson if (eval_mode_i == eval_mode_j) { 23790816752eSJeremy L Thompson output_matrix_reuse[i].index = j; 23800816752eSJeremy L Thompson output_matrix_reuse[i].is_input = true; 23810816752eSJeremy L Thompson output_matrix_reuse[i].eval_mode = eval_mode_j; 23820816752eSJeremy L Thompson } 23830816752eSJeremy L Thompson } 23840816752eSJeremy L Thompson } 23850816752eSJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_j)); 23860816752eSJeremy L Thompson } 23870816752eSJeremy L Thompson for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) { 23880816752eSJeremy L Thompson CeedEvalMode eval_mode_j; 23890816752eSJeremy L Thompson CeedBasis basis_j; 23900816752eSJeremy L Thompson 23910816752eSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j)); 23920816752eSJeremy L Thompson if (eval_mode_j == CEED_EVAL_WEIGHT) continue; 23930816752eSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j)); 23940816752eSJeremy L Thompson if (basis_i == basis_j) { 23950816752eSJeremy L Thompson if (is_tensor) { 23960816752eSJeremy L Thompson output_matrix_reuse[i].index = j; 23970816752eSJeremy L Thompson output_matrix_reuse[i].is_input = false; 23980816752eSJeremy L Thompson output_matrix_reuse[i].eval_mode = eval_mode_j; 23990816752eSJeremy L Thompson } else { 24000816752eSJeremy L Thompson // For non-tensor can only re-use with the same eval mode 24010816752eSJeremy L Thompson if (eval_mode_i == eval_mode_j) { 24020816752eSJeremy L Thompson output_matrix_reuse[i].index = j; 24030816752eSJeremy L Thompson output_matrix_reuse[i].is_input = false; 24040816752eSJeremy L Thompson output_matrix_reuse[i].eval_mode = eval_mode_j; 24050816752eSJeremy L Thompson } 24060816752eSJeremy L Thompson } 24070816752eSJeremy L Thompson } 24080816752eSJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_j)); 24090816752eSJeremy L Thompson } 24100816752eSJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis_i)); 24110816752eSJeremy L Thompson } 24120816752eSJeremy L Thompson 24130816752eSJeremy L Thompson // Initialize constants, and matrices B and G 24140816752eSJeremy L Thompson code << "\n" << tab << "// Input field constants and basis data\n"; 24150816752eSJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 24160816752eSJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], 2417ca1da9b9SJeremy L Thompson max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, true)); 24180816752eSJeremy L Thompson } 24190816752eSJeremy L Thompson code << "\n" << tab << "// Output field constants and basis data\n"; 24200816752eSJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 24210816752eSJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], 2422ca1da9b9SJeremy L Thompson max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, true)); 24230816752eSJeremy L Thompson } 24240816752eSJeremy L Thompson 24250816752eSJeremy L Thompson // Loop over all elements 24260816752eSJeremy L Thompson code << "\n" << tab << "// Element loop\n"; 24270816752eSJeremy L Thompson code << tab << "__syncthreads();\n"; 24280816752eSJeremy L Thompson code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n"; 24290816752eSJeremy L Thompson tab.push(); 24300816752eSJeremy L Thompson 24310816752eSJeremy L Thompson // -- Compute minimum buffer space needed 24320816752eSJeremy L Thompson CeedInt max_rstr_buffer_size = 1; 24330816752eSJeremy L Thompson 24340816752eSJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 24350816752eSJeremy L Thompson CeedEvalMode eval_mode; 24360816752eSJeremy L Thompson 24370816752eSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 24380816752eSJeremy L Thompson if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) { 24390816752eSJeremy L Thompson CeedInt num_comp; 24400816752eSJeremy L Thompson CeedElemRestriction elem_rstr; 24410816752eSJeremy L Thompson 24420816752eSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 24430816752eSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 24440816752eSJeremy L Thompson max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); 24450816752eSJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 24460816752eSJeremy L Thompson } 24470816752eSJeremy L Thompson } 24480816752eSJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 24490816752eSJeremy L Thompson CeedEvalMode eval_mode; 24500816752eSJeremy L Thompson 24510816752eSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 24520816752eSJeremy L Thompson if (eval_mode != CEED_EVAL_NONE) { 24530816752eSJeremy L Thompson CeedInt num_comp; 24540816752eSJeremy L Thompson CeedElemRestriction elem_rstr; 24550816752eSJeremy L Thompson 24560816752eSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 24570816752eSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 24580816752eSJeremy L Thompson max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1)); 24590816752eSJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 24600816752eSJeremy L Thompson } 24610816752eSJeremy L Thompson } 24620816752eSJeremy L Thompson code << tab << "// Scratch restriction buffer space\n"; 24630816752eSJeremy L Thompson code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n"; 24640816752eSJeremy L Thompson 24650816752eSJeremy L Thompson // -- Determine best input field processing order 24660816752eSJeremy L Thompson CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX]; 24670816752eSJeremy L Thompson 24680816752eSJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 24690816752eSJeremy L Thompson field_rstr_in_buffer[i] = -1; 24700816752eSJeremy L Thompson input_field_order[i] = -1; 24710816752eSJeremy L Thompson } 24720816752eSJeremy L Thompson { 24730816752eSJeremy L Thompson bool is_ordered[CEED_FIELD_MAX]; 24740816752eSJeremy L Thompson CeedInt curr_index = 0; 24750816752eSJeremy L Thompson 24760816752eSJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false; 24770816752eSJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 24780816752eSJeremy L Thompson CeedVector vec_i; 24790816752eSJeremy L Thompson CeedElemRestriction rstr_i; 24800816752eSJeremy L Thompson 24810816752eSJeremy L Thompson if (is_ordered[i]) continue; 24820816752eSJeremy L Thompson field_rstr_in_buffer[i] = i; 24830816752eSJeremy L Thompson is_ordered[i] = true; 24840816752eSJeremy L Thompson input_field_order[curr_index] = i; 24850816752eSJeremy L Thompson curr_index++; 24860816752eSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i)); 24870816752eSJeremy L Thompson if (vec_i == CEED_VECTOR_NONE) continue; // CEED_EVAL_WEIGHT 24880816752eSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i)); 24890816752eSJeremy L Thompson for (CeedInt j = i + 1; j < num_input_fields; j++) { 24900816752eSJeremy L Thompson CeedVector vec_j; 24910816752eSJeremy L Thompson CeedElemRestriction rstr_j; 24920816752eSJeremy L Thompson 24930816752eSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j)); 24940816752eSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j)); 24950816752eSJeremy L Thompson if (rstr_i == rstr_j && vec_i == vec_j) { 24960816752eSJeremy L Thompson field_rstr_in_buffer[j] = i; 24970816752eSJeremy L Thompson is_ordered[j] = true; 24980816752eSJeremy L Thompson input_field_order[curr_index] = j; 24990816752eSJeremy L Thompson curr_index++; 25000816752eSJeremy L Thompson } 25010816752eSJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec_j)); 25020816752eSJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); 25030816752eSJeremy L Thompson } 25040816752eSJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec_i)); 25050816752eSJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); 25060816752eSJeremy L Thompson } 25070816752eSJeremy L Thompson } 25080816752eSJeremy L Thompson 25090816752eSJeremy L Thompson // -- Input restriction and basis 25100816752eSJeremy L Thompson code << "\n" << tab << "// -- Input field restrictions and basis actions\n"; 25110816752eSJeremy L Thompson CeedInt num_active_in = 0, num_active_out = 0, qf_assembly_size_out = 0; 25120816752eSJeremy L Thompson CeedInt active_fields_in[CEED_FIELD_MAX], active_fields_out[CEED_FIELD_MAX]; 25130816752eSJeremy L Thompson 25140816752eSJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 25150816752eSJeremy L Thompson bool is_active = false; 25160816752eSJeremy L Thompson const char *field_name; 25170816752eSJeremy L Thompson const CeedInt f = input_field_order[i]; 25180816752eSJeremy L Thompson 25190816752eSJeremy L Thompson { 25200816752eSJeremy L Thompson CeedVector vec; 25210816752eSJeremy L Thompson 25220816752eSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec)); 25230816752eSJeremy L Thompson is_active = vec == CEED_VECTOR_ACTIVE; 25240816752eSJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec)); 25250816752eSJeremy L Thompson } 25260816752eSJeremy L Thompson 25270816752eSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name)); 25280816752eSJeremy L Thompson code << tab << "// ---- Input field " << f << ": " << field_name << "\n"; 25290816752eSJeremy L Thompson 25300816752eSJeremy L Thompson if (is_active) { 25310816752eSJeremy L Thompson CeedEvalMode eval_mode; 25320816752eSJeremy L Thompson CeedInt field_size; 25330816752eSJeremy L Thompson 25340816752eSJeremy L Thompson active_fields_in[num_active_in] = f; 25350816752eSJeremy L Thompson num_active_in++; 25360816752eSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[f], &field_size)); 25370816752eSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[f], &eval_mode)); 25380816752eSJeremy L Thompson if (eval_mode == CEED_EVAL_GRAD) { 25390816752eSJeremy L Thompson code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << "dim_in_" << f << "*" 25400816752eSJeremy L Thompson << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n"; 25410816752eSJeremy L Thompson } else { 25420816752eSJeremy L Thompson code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n"; 25430816752eSJeremy L Thompson } 25440816752eSJeremy L Thompson code << tab << "const CeedInt field_size_in_" << f << " = " << field_size << ";\n"; 25450816752eSJeremy L Thompson } else { 25460816752eSJeremy L Thompson // ---- Restriction 25470816752eSJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], 25480816752eSJeremy L Thompson max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices)); 25490816752eSJeremy L Thompson 25500816752eSJeremy L Thompson // ---- Basis action 25510816752eSJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true, 25520816752eSJeremy L Thompson is_all_tensor, is_at_points, use_3d_slices)); 25530816752eSJeremy L Thompson } 25540816752eSJeremy L Thompson } 25550816752eSJeremy L Thompson code << tab << "const CeedInt field_sizes_in[" << num_active_in << "] = {"; 25560816752eSJeremy L Thompson for (CeedInt i = 0; i < num_active_in; i++) { 25570816752eSJeremy L Thompson code << "field_size_in_" << active_fields_in[i] << (i < num_active_in - 1 ? ", " : ""); 25580816752eSJeremy L Thompson } 25590816752eSJeremy L Thompson code << "};\n"; 25600816752eSJeremy L Thompson code << tab << "CeedScalar * r_q_in[" << num_active_in << "] = {"; 25610816752eSJeremy L Thompson for (CeedInt i = 0; i < num_active_in; i++) { 25620816752eSJeremy L Thompson code << "r_q_in_" << active_fields_in[i] << (i < num_active_in - 1 ? ", " : ""); 25630816752eSJeremy L Thompson } 25640816752eSJeremy L Thompson code << "};\n"; 25650816752eSJeremy L Thompson 25660816752eSJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 25670816752eSJeremy L Thompson bool is_active = false; 25680816752eSJeremy L Thompson 25690816752eSJeremy L Thompson { 25700816752eSJeremy L Thompson CeedVector vec; 25710816752eSJeremy L Thompson 25720816752eSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); 25730816752eSJeremy L Thompson is_active = vec == CEED_VECTOR_ACTIVE; 25740816752eSJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec)); 25750816752eSJeremy L Thompson } 25760816752eSJeremy L Thompson if (is_active) { 25770816752eSJeremy L Thompson const char *field_name; 25780816752eSJeremy L Thompson CeedInt field_size; 25790816752eSJeremy L Thompson 25800816752eSJeremy L Thompson active_fields_out[num_active_out] = i; 25810816752eSJeremy L Thompson num_active_out++; 25820816752eSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size)); 25830816752eSJeremy L Thompson qf_assembly_size_out += field_size; 25840816752eSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 25850816752eSJeremy L Thompson code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 25860816752eSJeremy L Thompson code << tab << "const CeedInt field_size_out_" << i << " = " << field_size << ";\n"; 25870816752eSJeremy L Thompson } 25880816752eSJeremy L Thompson } 25890816752eSJeremy L Thompson code << tab << "const CeedInt field_sizes_out[" << num_active_out << "] = {"; 25900816752eSJeremy L Thompson for (CeedInt i = 0; i < num_active_out; i++) { 25910816752eSJeremy L Thompson code << "field_size_out_" << active_fields_out[i] << (i < num_active_out - 1 ? ", " : ""); 25920816752eSJeremy L Thompson } 25930816752eSJeremy L Thompson code << "};\n"; 25940816752eSJeremy L Thompson code << tab << "const CeedInt total_size_out = " << qf_assembly_size_out << ";\n"; 25950816752eSJeremy L Thompson 25960816752eSJeremy L Thompson // -- Loop over active field 25970816752eSJeremy L Thompson code << "\n" << tab << "CeedInt input_offset = 0;\n"; 25980816752eSJeremy L Thompson code << tab << "// Loop over active QFunction input fields\n"; 25990816752eSJeremy L Thompson code << tab << "const CeedInt num_active_in = " << num_active_in << ";\n"; 26000816752eSJeremy L Thompson code << tab << "for (CeedInt a = 0; a < num_active_in; a++) {\n"; 26010816752eSJeremy L Thompson tab.push(); 26020816752eSJeremy L Thompson 26030816752eSJeremy L Thompson // -- Loop over size of active field 26040816752eSJeremy L Thompson code << "\n" << tab << "// Loop over current active input field size\n"; 26050816752eSJeremy L Thompson code << tab << "const CeedInt field_size_in = field_sizes_in[a];\n"; 26060816752eSJeremy L Thompson code << tab << "for (CeedInt s = 0; s < field_size_in; s++) {\n"; 26070816752eSJeremy L Thompson tab.push(); 26080816752eSJeremy L Thompson 26090816752eSJeremy L Thompson // -- Set current active point and component to 1 26100816752eSJeremy L Thompson code << tab << "// Set current active point and component to 1.0\n"; 26110816752eSJeremy L Thompson if (is_all_tensor && (max_dim >= 3)) { 26120816752eSJeremy L Thompson code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 1.0;\n"; 26130816752eSJeremy L Thompson } else { 26140816752eSJeremy L Thompson code << tab << "r_q_in[a][s] = 1.0;\n"; 26150816752eSJeremy L Thompson } 26160816752eSJeremy L Thompson 26170816752eSJeremy L Thompson // -- Q function 26180816752eSJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields, 26190816752eSJeremy L Thompson qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name, 2620*745f16d1SZach Atkins Q_1d, is_all_tensor, is_at_points, use_3d_slices, true)); 26210816752eSJeremy L Thompson 26220816752eSJeremy L Thompson // -- Output basis and restriction 26230816752eSJeremy L Thompson code << "\n" << tab << "// -- Output field basis action and restrictions\n"; 26240816752eSJeremy L Thompson CeedScalar offset = 0; 26250816752eSJeremy L Thompson 26260816752eSJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 26270816752eSJeremy L Thompson bool is_active = false; 26280816752eSJeremy L Thompson const char *field_name; 26290816752eSJeremy L Thompson 26300816752eSJeremy L Thompson { 26310816752eSJeremy L Thompson CeedVector vec; 26320816752eSJeremy L Thompson 26330816752eSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec)); 26340816752eSJeremy L Thompson is_active = vec == CEED_VECTOR_ACTIVE; 26350816752eSJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec)); 26360816752eSJeremy L Thompson } 26370816752eSJeremy L Thompson if (!is_active) continue; 26380816752eSJeremy L Thompson 26390816752eSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name)); 26400816752eSJeremy L Thompson code << tab << "// ---- Output field " << i << ": " << field_name << "\n"; 26410816752eSJeremy L Thompson 26420816752eSJeremy L Thompson // ---- Restriction 26430816752eSJeremy L Thompson CeedInt field_size; 26440816752eSJeremy L Thompson 26450816752eSJeremy L Thompson code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d_QFAssembly<total_size_out, field_size_out_" << i << ", " 26460816752eSJeremy L Thompson << (is_all_tensor ? "Q_1d" : "Q") << ">(data, num_elem, elem, input_offset + s, " << offset << ", r_q_out_" << i << ", values_array);\n"; 26470816752eSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size)); 26480816752eSJeremy L Thompson offset += field_size; 26490816752eSJeremy L Thompson } 26500816752eSJeremy L Thompson 26510816752eSJeremy L Thompson // -- Reset current active node and component 26520816752eSJeremy L Thompson code << "\n" << tab << "// Reset current active node and component to 0.0\n"; 26530816752eSJeremy L Thompson if (is_all_tensor && (max_dim >= 3)) { 26540816752eSJeremy L Thompson code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 0.0;\n"; 26550816752eSJeremy L Thompson } else { 26560816752eSJeremy L Thompson code << tab << "r_q_in[a][s] = 0.0;\n"; 26570816752eSJeremy L Thompson } 26580816752eSJeremy L Thompson 26590816752eSJeremy L Thompson // -- End of loop over size of active field 26600816752eSJeremy L Thompson tab.pop(); 26610816752eSJeremy L Thompson code << tab << "}\n"; 26620816752eSJeremy L Thompson code << tab << "input_offset += field_size_in;\n"; 26630816752eSJeremy L Thompson 26640816752eSJeremy L Thompson // -- End of loop over active field 26650816752eSJeremy L Thompson tab.pop(); 26660816752eSJeremy L Thompson code << tab << "}\n"; 26670816752eSJeremy L Thompson 26680816752eSJeremy L Thompson // Close loop and function 26690816752eSJeremy L Thompson tab.pop(); 26700816752eSJeremy L Thompson code << tab << "}\n"; 26710816752eSJeremy L Thompson tab.pop(); 26720816752eSJeremy L Thompson code << tab << "}\n"; 26730816752eSJeremy L Thompson code << tab << "// -----------------------------------------------------------------------------\n\n"; 26740816752eSJeremy L Thompson 26750816752eSJeremy L Thompson // Compile 26760816752eSJeremy L Thompson { 26770816752eSJeremy L Thompson bool is_compile_good = false; 26780816752eSJeremy L Thompson const CeedInt T_1d = CeedIntMax(is_all_tensor ? Q_1d : Q, data->max_P_1d); 26790816752eSJeremy L Thompson 26800816752eSJeremy L Thompson data->thread_1d = T_1d; 26810816752eSJeremy L Thompson CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 1, "OP_T_1D", T_1d)); 26820816752eSJeremy L Thompson if (is_compile_good) { 26830816752eSJeremy L Thompson *is_good_build = true; 26840816752eSJeremy L Thompson CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module_assemble_qfunction, operator_name.c_str(), &data->assemble_qfunction)); 26850816752eSJeremy L Thompson } else { 26860816752eSJeremy L Thompson *is_good_build = false; 26870816752eSJeremy L Thompson data->use_assembly_fallback = true; 26880816752eSJeremy L Thompson } 26890816752eSJeremy L Thompson } 26900816752eSJeremy L Thompson CeedCallBackend(CeedDestroy(&ceed)); 26910816752eSJeremy L Thompson CeedCallBackend(CeedQFunctionDestroy(&qf)); 26920816752eSJeremy L Thompson return CEED_ERROR_SUCCESS; 26930816752eSJeremy L Thompson } 26940816752eSJeremy L Thompson 26950816752eSJeremy L Thompson //------------------------------------------------------------------------------ 2696