15aed82e4SJeremy L Thompson // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3241a4b83SYohann // 43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 5241a4b83SYohann // 63d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 73d576824SJeremy L Thompson 8b8e71988SJeremy L Thompson #define CEED_DEBUG_COLOR 12 97df94212SJeremy L Thompson 1049aac155SJeremy L Thompson #include <ceed.h> 11ec3da8bcSJed Brown #include <ceed/backend.h> 129e201c85SYohann #include <ceed/jit-tools.h> 133d576824SJeremy L Thompson #include <cuda_runtime.h> 142b730f8bSJeremy L Thompson 15241a4b83SYohann #include <iostream> 16241a4b83SYohann #include <sstream> 17b2165e7aSSebastian Grimberg #include <string> 182b730f8bSJeremy L Thompson 190d0321e0SJeremy L Thompson #include "../cuda-ref/ceed-cuda-ref.h" 20241a4b83SYohann #include "../cuda-shared/ceed-cuda-shared.h" 2149aac155SJeremy L Thompson #include "../cuda/ceed-cuda-common.h" 222b730f8bSJeremy L Thompson #include "../cuda/ceed-cuda-compile.h" 232b730f8bSJeremy L Thompson #include "ceed-cuda-gen.h" 24241a4b83SYohann 25ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 264b3e95d5SJeremy L Thompson // Determine type of operator 274b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 284b3e95d5SJeremy L Thompson static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields, 294b3e95d5SJeremy L Thompson CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields, 304b3e95d5SJeremy L Thompson CeedQFunctionField *qf_output_fields, CeedInt *max_P_1d, CeedInt *Q_1d, CeedInt *dim, bool *is_tensor, 314b3e95d5SJeremy L Thompson bool *use_3d_slices) { 324b3e95d5SJeremy L Thompson // Find dim, P_1d, Q_1d 334b3e95d5SJeremy L Thompson *max_P_1d = 0; 344b3e95d5SJeremy L Thompson *Q_1d = 0; 354b3e95d5SJeremy L Thompson *dim = 0; 364b3e95d5SJeremy L Thompson *is_tensor = true; 374b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 384b3e95d5SJeremy L Thompson CeedBasis basis; 394b3e95d5SJeremy L Thompson 404b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 414b3e95d5SJeremy L Thompson if (basis != CEED_BASIS_NONE) { 424b3e95d5SJeremy L Thompson bool is_field_tensor; 434b3e95d5SJeremy L Thompson CeedInt field_P_1d = 0, field_Q_1d = 0, field_dim = 0; 444b3e95d5SJeremy L Thompson 454b3e95d5SJeremy L Thompson // Collect dim, P_1d, and Q_1d 464b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor)); 474b3e95d5SJeremy L Thompson CeedCheck(is_field_tensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 484b3e95d5SJeremy L Thompson *is_tensor = *is_tensor && is_field_tensor; 494b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d)); 504b3e95d5SJeremy L Thompson *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d); 514b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &field_dim)); 524b3e95d5SJeremy L Thompson CeedCheck(*dim == 0 || field_dim == *dim, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible"); 534b3e95d5SJeremy L Thompson *dim = field_dim; 544b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d)); 554b3e95d5SJeremy L Thompson CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible"); 564b3e95d5SJeremy L Thompson *Q_1d = field_Q_1d; 574b3e95d5SJeremy L Thompson } 58681d0ea7SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 594b3e95d5SJeremy L Thompson } 604b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 614b3e95d5SJeremy L Thompson CeedBasis basis; 624b3e95d5SJeremy L Thompson 634b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 644b3e95d5SJeremy L Thompson if (basis != CEED_BASIS_NONE) { 654b3e95d5SJeremy L Thompson bool is_field_tensor; 664b3e95d5SJeremy L Thompson CeedInt field_P_1d = 0, field_Q_1d = 0, field_dim = 0; 674b3e95d5SJeremy L Thompson 684b3e95d5SJeremy L Thompson // Collect dim, P_1d, and Q_1d 694b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor)); 704b3e95d5SJeremy L Thompson CeedCheck(is_field_tensor, ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 714b3e95d5SJeremy L Thompson *is_tensor = *is_tensor && is_field_tensor; 724b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d)); 734b3e95d5SJeremy L Thompson *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d); 744b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisGetDimension(basis, &field_dim)); 754b3e95d5SJeremy L Thompson CeedCheck(*dim == 0 || field_dim == *dim, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible"); 764b3e95d5SJeremy L Thompson *dim = field_dim; 774b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d)); 784b3e95d5SJeremy L Thompson CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible"); 794b3e95d5SJeremy L Thompson *Q_1d = field_Q_1d; 804b3e95d5SJeremy L Thompson } 81681d0ea7SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 824b3e95d5SJeremy L Thompson } 834b3e95d5SJeremy L Thompson 844b3e95d5SJeremy L Thompson // Only use 3D collocated gradient parallelization strategy when gradient is computed 854b3e95d5SJeremy L Thompson *use_3d_slices = false; 864b3e95d5SJeremy L Thompson if (*dim == 3) { 874b3e95d5SJeremy L Thompson bool was_grad_found = false; 884b3e95d5SJeremy L Thompson 894b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 904b3e95d5SJeremy L Thompson CeedEvalMode eval_mode; 914b3e95d5SJeremy L Thompson 924b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 934b3e95d5SJeremy L Thompson if (eval_mode == CEED_EVAL_GRAD) { 944b3e95d5SJeremy L Thompson CeedBasis_Cuda_shared *basis_data; 954b3e95d5SJeremy L Thompson CeedBasis basis; 964b3e95d5SJeremy L Thompson 974b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); 984b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &basis_data)); 994b3e95d5SJeremy L Thompson *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true); 1004b3e95d5SJeremy L Thompson was_grad_found = true; 101681d0ea7SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 1024b3e95d5SJeremy L Thompson } 1034b3e95d5SJeremy L Thompson } 1044b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 1054b3e95d5SJeremy L Thompson CeedEvalMode eval_mode; 1064b3e95d5SJeremy L Thompson 1074b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 1084b3e95d5SJeremy L Thompson if (eval_mode == CEED_EVAL_GRAD) { 1094b3e95d5SJeremy L Thompson CeedBasis_Cuda_shared *basis_data; 1104b3e95d5SJeremy L Thompson CeedBasis basis; 1114b3e95d5SJeremy L Thompson 1124b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); 1134b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &basis_data)); 1144b3e95d5SJeremy L Thompson *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true); 1154b3e95d5SJeremy L Thompson was_grad_found = true; 116681d0ea7SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 1174b3e95d5SJeremy L Thompson } 1184b3e95d5SJeremy L Thompson } 1194b3e95d5SJeremy L Thompson } 1204b3e95d5SJeremy L Thompson return CEED_ERROR_SUCCESS; 1214b3e95d5SJeremy L Thompson } 1224b3e95d5SJeremy L Thompson 1234b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 1244b3e95d5SJeremy L Thompson // Setup fields 1254b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 1264b3e95d5SJeremy L Thompson static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedOperatorField op_field, 127*8b97b69aSJeremy L Thompson CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool is_at_points, 128*8b97b69aSJeremy L Thompson bool use_3d_slices) { 1294b3e95d5SJeremy L Thompson std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i); 1304b3e95d5SJeremy L Thompson std::string P_name = "P_1d" + var_suffix, Q_name = "Q_1d"; 1314b3e95d5SJeremy L Thompson std::string option_name = (is_input ? "inputs" : "outputs"); 1324b3e95d5SJeremy L Thompson CeedEvalMode eval_mode = CEED_EVAL_NONE; 1334b3e95d5SJeremy L Thompson CeedInt elem_size = 0, num_comp = 0, P_1d = 0; 1344b3e95d5SJeremy L Thompson CeedElemRestriction elem_rstr; 1354b3e95d5SJeremy L Thompson CeedBasis_Cuda_shared *basis_data; 1364b3e95d5SJeremy L Thompson CeedBasis basis; 1374b3e95d5SJeremy L Thompson 1384b3e95d5SJeremy L Thompson code << " // -- " << (is_input ? "Input" : "Output") << " field " << i << "\n"; 1394b3e95d5SJeremy L Thompson 1404b3e95d5SJeremy L Thompson // Get field data 1414b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr)); 1424b3e95d5SJeremy L Thompson if (elem_rstr != CEED_ELEMRESTRICTION_NONE) { 1434b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 1444b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 1454b3e95d5SJeremy L Thompson } 146681d0ea7SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 1474b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis)); 1484b3e95d5SJeremy L Thompson if (basis != CEED_BASIS_NONE) { 1494b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &basis_data)); 1504b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 1514b3e95d5SJeremy L Thompson } 1524b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode)); 1534b3e95d5SJeremy L Thompson 1544b3e95d5SJeremy L Thompson // Set field constants 1554b3e95d5SJeremy L Thompson if (eval_mode != CEED_EVAL_WEIGHT) { 1564b3e95d5SJeremy L Thompson code << " const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n"; 1574b3e95d5SJeremy L Thompson code << " const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n"; 1584b3e95d5SJeremy L Thompson } 1594b3e95d5SJeremy L Thompson 1604b3e95d5SJeremy L Thompson // Load basis data 1614b3e95d5SJeremy L Thompson code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 1624b3e95d5SJeremy L Thompson switch (eval_mode) { 1634b3e95d5SJeremy L Thompson case CEED_EVAL_NONE: 1644b3e95d5SJeremy L Thompson break; 1654b3e95d5SJeremy L Thompson case CEED_EVAL_INTERP: 166*8b97b69aSJeremy L Thompson if (is_at_points) { 167*8b97b69aSJeremy L Thompson // AtPoints 168*8b97b69aSJeremy L Thompson if (!basis_data->d_chebyshev_interp_1d) { 169*8b97b69aSJeremy L Thompson CeedSize interp_bytes; 170*8b97b69aSJeremy L Thompson CeedScalar *chebyshev_interp_1d; 171*8b97b69aSJeremy L Thompson 172*8b97b69aSJeremy L Thompson interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 173*8b97b69aSJeremy L Thompson CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 174*8b97b69aSJeremy L Thompson CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 175*8b97b69aSJeremy L Thompson CeedCallCuda(CeedBasisReturnCeed(basis), cudaMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes)); 176*8b97b69aSJeremy L Thompson CeedCallCuda(CeedBasisReturnCeed(basis), 177*8b97b69aSJeremy L Thompson cudaMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 178*8b97b69aSJeremy L Thompson CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 179*8b97b69aSJeremy L Thompson } 180*8b97b69aSJeremy L Thompson if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d; 181*8b97b69aSJeremy L Thompson else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d; 182*8b97b69aSJeremy L Thompson } else { 183*8b97b69aSJeremy L Thompson // Standard quadrature 1844b3e95d5SJeremy L Thompson if (is_input) data->B.inputs[i] = basis_data->d_interp_1d; 1854b3e95d5SJeremy L Thompson else data->B.outputs[i] = basis_data->d_interp_1d; 186*8b97b69aSJeremy L Thompson } 1874b3e95d5SJeremy L Thompson code << " __shared__ CeedScalar s_B" << var_suffix << "[" << P_1d * Q_1d << "];\n"; 188f815fac9SJeremy L Thompson code << " LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n"; 1894b3e95d5SJeremy L Thompson break; 1904b3e95d5SJeremy L Thompson case CEED_EVAL_GRAD: 191*8b97b69aSJeremy L Thompson if (is_at_points) { 192*8b97b69aSJeremy L Thompson // AtPoints 193*8b97b69aSJeremy L Thompson if (!basis_data->d_chebyshev_interp_1d) { 194*8b97b69aSJeremy L Thompson CeedSize interp_bytes; 195*8b97b69aSJeremy L Thompson CeedScalar *chebyshev_interp_1d; 196*8b97b69aSJeremy L Thompson 197*8b97b69aSJeremy L Thompson interp_bytes = P_1d * Q_1d * sizeof(CeedScalar); 198*8b97b69aSJeremy L Thompson CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d)); 199*8b97b69aSJeremy L Thompson CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d)); 200*8b97b69aSJeremy L Thompson CeedCallCuda(CeedBasisReturnCeed(basis), cudaMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes)); 201*8b97b69aSJeremy L Thompson CeedCallCuda(CeedBasisReturnCeed(basis), 202*8b97b69aSJeremy L Thompson cudaMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice)); 203*8b97b69aSJeremy L Thompson CeedCallBackend(CeedFree(&chebyshev_interp_1d)); 204*8b97b69aSJeremy L Thompson } 205*8b97b69aSJeremy L Thompson if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d; 206*8b97b69aSJeremy L Thompson else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d; 207*8b97b69aSJeremy L Thompson } else { 208*8b97b69aSJeremy L Thompson // Standard quadrature 2094b3e95d5SJeremy L Thompson if (is_input) data->B.inputs[i] = basis_data->d_interp_1d; 2104b3e95d5SJeremy L Thompson else data->B.outputs[i] = basis_data->d_interp_1d; 211*8b97b69aSJeremy L Thompson } 2124b3e95d5SJeremy L Thompson code << " __shared__ CeedScalar s_B" << var_suffix << "[" << P_1d * Q_1d << "];\n"; 213f815fac9SJeremy L Thompson code << " LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n"; 214*8b97b69aSJeremy L Thompson if (is_at_points) break; // No G mat for AtPoints 2154b3e95d5SJeremy L Thompson if (use_3d_slices) { 2164b3e95d5SJeremy L Thompson if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d; 2174b3e95d5SJeremy L Thompson else data->G.outputs[i] = basis_data->d_collo_grad_1d; 2184b3e95d5SJeremy L Thompson code << " __shared__ CeedScalar s_G" << var_suffix << "[" << Q_1d * Q_1d << "];\n"; 219f815fac9SJeremy L Thompson code << " LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; 2204b3e95d5SJeremy L Thompson } else { 2214b3e95d5SJeremy L Thompson bool has_collo_grad = basis_data->d_collo_grad_1d; 2224b3e95d5SJeremy L Thompson 2234b3e95d5SJeremy L Thompson if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; 2244b3e95d5SJeremy L Thompson else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; 2254b3e95d5SJeremy L Thompson if (has_collo_grad) { 2264b3e95d5SJeremy L Thompson code << " __shared__ CeedScalar s_G" << var_suffix << "[" << Q_1d * Q_1d << "];\n"; 227f815fac9SJeremy L Thompson code << " LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; 2284b3e95d5SJeremy L Thompson } else { 2294b3e95d5SJeremy L Thompson code << " __shared__ CeedScalar s_G" << var_suffix << "[" << Q_1d * P_1d << "];\n"; 230f815fac9SJeremy L Thompson code << " LoadMatrix<" << P_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; 2314b3e95d5SJeremy L Thompson } 2324b3e95d5SJeremy L Thompson } 2334b3e95d5SJeremy L Thompson break; 2344b3e95d5SJeremy L Thompson case CEED_EVAL_WEIGHT: 2354b3e95d5SJeremy L Thompson break; // No action 2364b3e95d5SJeremy L Thompson // LCOV_EXCL_START 2374b3e95d5SJeremy L Thompson case CEED_EVAL_DIV: 2384b3e95d5SJeremy L Thompson case CEED_EVAL_CURL: 2394b3e95d5SJeremy L Thompson break; // TODO: Not implemented 2404b3e95d5SJeremy L Thompson // LCOV_EXCL_STOP 2414b3e95d5SJeremy L Thompson } 242*8b97b69aSJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 2434b3e95d5SJeremy L Thompson return CEED_ERROR_SUCCESS; 2444b3e95d5SJeremy L Thompson } 2454b3e95d5SJeremy L Thompson 2464b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 2474b3e95d5SJeremy L Thompson // Restriction 2484b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 2494b3e95d5SJeremy L Thompson static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim, 250e93651e5SJeremy L Thompson CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field, 251*8b97b69aSJeremy L Thompson CeedInt Q_1d, bool is_input, bool is_at_points, bool use_3d_slices) { 2524b3e95d5SJeremy L Thompson std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i); 2534b3e95d5SJeremy L Thompson std::string P_name = "P_1d" + var_suffix; 2544b3e95d5SJeremy L Thompson CeedEvalMode eval_mode = CEED_EVAL_NONE; 2554b3e95d5SJeremy L Thompson CeedInt elem_size = 0, num_comp = 0, P_1d = 0; 2564b3e95d5SJeremy L Thompson CeedSize l_size; 257f815fac9SJeremy L Thompson CeedRestrictionType rstr_type = CEED_RESTRICTION_STANDARD; 2584b3e95d5SJeremy L Thompson CeedElemRestriction_Cuda *rstr_data; 2594b3e95d5SJeremy L Thompson CeedElemRestriction elem_rstr; 2604b3e95d5SJeremy L Thompson CeedBasis basis; 2614b3e95d5SJeremy L Thompson 2624b3e95d5SJeremy L Thompson // Get field data 2634b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr)); 2644b3e95d5SJeremy L Thompson if (elem_rstr != CEED_ELEMRESTRICTION_NONE) { 265f815fac9SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type)); 2664b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 2674b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 2684b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); 2694b3e95d5SJeremy L Thompson } 2704b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis)); 2714b3e95d5SJeremy L Thompson if (basis != CEED_BASIS_NONE) { 2724b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 2734b3e95d5SJeremy L Thompson } 274681d0ea7SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 2754b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode)); 2764b3e95d5SJeremy L Thompson 2774b3e95d5SJeremy L Thompson // Restriction 2784b3e95d5SJeremy L Thompson if (is_input) { 2794b3e95d5SJeremy L Thompson // Input 280e93651e5SJeremy L Thompson if (field_input_buffer[i] != i) { 281e93651e5SJeremy L Thompson std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]); 282e93651e5SJeremy L Thompson 283e93651e5SJeremy L Thompson // Restriction was already done for previous input 284e93651e5SJeremy L Thompson code << " CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n"; 285*8b97b69aSJeremy L Thompson } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) { 286*8b97b69aSJeremy L Thompson if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) { 287e93651e5SJeremy L Thompson // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated 2884b3e95d5SJeremy L Thompson code << " CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n"; 289*8b97b69aSJeremy L Thompson } else if (rstr_type != CEED_RESTRICTION_POINTS) { 290e93651e5SJeremy L Thompson // Otherwise we're using the scratch space 291e93651e5SJeremy L Thompson code << " CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n"; 292e93651e5SJeremy L Thompson } 293f815fac9SJeremy L Thompson switch (rstr_type) { 294f815fac9SJeremy L Thompson case CEED_RESTRICTION_STANDARD: { 2954b3e95d5SJeremy L Thompson CeedInt comp_stride; 2964b3e95d5SJeremy L Thompson 2974b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 2984b3e95d5SJeremy L Thompson code << " const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; 2994b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 3004b3e95d5SJeremy L Thompson code << " // CompStride: " << comp_stride << "\n"; 3014b3e95d5SJeremy L Thompson data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; 302f815fac9SJeremy L Thompson code << " ReadLVecStandard" << dim << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name << ">(data, l_size" 303f815fac9SJeremy L Thompson << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix << ");\n"; 304f815fac9SJeremy L Thompson break; 305f815fac9SJeremy L Thompson } 306f815fac9SJeremy L Thompson case CEED_RESTRICTION_STRIDED: { 3074b3e95d5SJeremy L Thompson bool has_backend_strides; 3084b3e95d5SJeremy L Thompson CeedInt num_elem; 3094b3e95d5SJeremy L Thompson 3104b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); 3114b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); 3124b3e95d5SJeremy L Thompson CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 3134b3e95d5SJeremy L Thompson 3144b3e95d5SJeremy L Thompson if (!has_backend_strides) { 3154b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); 3164b3e95d5SJeremy L Thompson } 3174b3e95d5SJeremy L Thompson code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; 318f815fac9SJeremy L Thompson code << " ReadLVecStrided" << dim << "d<num_comp" << var_suffix << ", " << P_name << "," << strides[0] << "," << strides[1] << "," 3194b3e95d5SJeremy L Thompson << strides[2] << ">(data, elem, d" << var_suffix << ", r_e" << var_suffix << ");\n"; 320f815fac9SJeremy L Thompson break; 321f815fac9SJeremy L Thompson } 322*8b97b69aSJeremy L Thompson case CEED_RESTRICTION_POINTS: { 323*8b97b69aSJeremy L Thompson CeedInt comp_stride; 324*8b97b69aSJeremy L Thompson 325*8b97b69aSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 326*8b97b69aSJeremy L Thompson code << " const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 327*8b97b69aSJeremy L Thompson data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; 328*8b97b69aSJeremy L Thompson break; 329*8b97b69aSJeremy L Thompson } 330f815fac9SJeremy L Thompson // LCOV_EXCL_START 331f815fac9SJeremy L Thompson case CEED_RESTRICTION_ORIENTED: 332f815fac9SJeremy L Thompson case CEED_RESTRICTION_CURL_ORIENTED: 333f815fac9SJeremy L Thompson break; // TODO: Not implemented 334f815fac9SJeremy L Thompson // LCOV_EXCL_STOP 3354b3e95d5SJeremy L Thompson } 3364b3e95d5SJeremy L Thompson } 3374b3e95d5SJeremy L Thompson } else { 3384b3e95d5SJeremy L Thompson // Output 339f815fac9SJeremy L Thompson switch (rstr_type) { 340f815fac9SJeremy L Thompson case CEED_RESTRICTION_STANDARD: { 3414b3e95d5SJeremy L Thompson CeedInt comp_stride; 3424b3e95d5SJeremy L Thompson 3434b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 3444b3e95d5SJeremy L Thompson code << " const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; 3454b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 3464b3e95d5SJeremy L Thompson code << " // CompStride: " << comp_stride << "\n"; 3474b3e95d5SJeremy L Thompson data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets; 348f815fac9SJeremy L Thompson code << " WriteLVecStandard" << dim << "d<num_comp" << var_suffix << ", " << comp_stride << ", " << P_name << ">(data, l_size" 349f815fac9SJeremy L Thompson << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix << ");\n"; 350f815fac9SJeremy L Thompson break; 351f815fac9SJeremy L Thompson } 352f815fac9SJeremy L Thompson case CEED_RESTRICTION_STRIDED: { 3534b3e95d5SJeremy L Thompson bool has_backend_strides; 3544b3e95d5SJeremy L Thompson CeedInt num_elem; 3554b3e95d5SJeremy L Thompson 3564b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); 3574b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); 3584b3e95d5SJeremy L Thompson CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 3594b3e95d5SJeremy L Thompson 3604b3e95d5SJeremy L Thompson if (!has_backend_strides) { 3614b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); 3624b3e95d5SJeremy L Thompson } 3634b3e95d5SJeremy L Thompson code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; 364f815fac9SJeremy L Thompson code << " WriteLVecStrided" << dim << "d<num_comp" << var_suffix << ", " << P_name << "," << strides[0] << "," << strides[1] << "," 3654b3e95d5SJeremy L Thompson << strides[2] << ">(data, elem, r_e" << var_suffix << ", d" << var_suffix << ");\n"; 366f815fac9SJeremy L Thompson break; 367f815fac9SJeremy L Thompson } 368*8b97b69aSJeremy L Thompson case CEED_RESTRICTION_POINTS: 369*8b97b69aSJeremy L Thompson data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets; 370*8b97b69aSJeremy L Thompson break; 371f815fac9SJeremy L Thompson // LCOV_EXCL_START 372f815fac9SJeremy L Thompson case CEED_RESTRICTION_ORIENTED: 373f815fac9SJeremy L Thompson case CEED_RESTRICTION_CURL_ORIENTED: 374f815fac9SJeremy L Thompson break; // TODO: Not implemented 375f815fac9SJeremy L Thompson // LCOV_EXCL_STOP 3764b3e95d5SJeremy L Thompson } 3774b3e95d5SJeremy L Thompson } 378681d0ea7SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 3794b3e95d5SJeremy L Thompson return CEED_ERROR_SUCCESS; 3804b3e95d5SJeremy L Thompson } 3814b3e95d5SJeremy L Thompson 3824b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 3834b3e95d5SJeremy L Thompson // Basis 3844b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 3854b3e95d5SJeremy L Thompson static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim, 3864b3e95d5SJeremy L Thompson CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, 387*8b97b69aSJeremy L Thompson bool is_at_points, bool use_3d_slices) { 3884b3e95d5SJeremy L Thompson std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i); 3894b3e95d5SJeremy L Thompson std::string P_name = "P_1d" + var_suffix, Q_name = "Q_1d"; 3904b3e95d5SJeremy L Thompson CeedEvalMode eval_mode = CEED_EVAL_NONE; 3914b3e95d5SJeremy L Thompson CeedInt elem_size = 0, num_comp = 0, P_1d = 0; 3924b3e95d5SJeremy L Thompson CeedElemRestriction elem_rstr; 3934b3e95d5SJeremy L Thompson CeedBasis basis; 3944b3e95d5SJeremy L Thompson 3954b3e95d5SJeremy L Thompson // Get field data 3964b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr)); 3974b3e95d5SJeremy L Thompson if (elem_rstr != CEED_ELEMRESTRICTION_NONE) { 3984b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 3994b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 4004b3e95d5SJeremy L Thompson } 401681d0ea7SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 4024b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis)); 4034b3e95d5SJeremy L Thompson if (basis != CEED_BASIS_NONE) { 4044b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); 4054b3e95d5SJeremy L Thompson } 4064b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode)); 4074b3e95d5SJeremy L Thompson 4084b3e95d5SJeremy L Thompson // Basis 4094b3e95d5SJeremy L Thompson code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 4104b3e95d5SJeremy L Thompson if (is_input) { 4114b3e95d5SJeremy L Thompson switch (eval_mode) { 4124b3e95d5SJeremy L Thompson case CEED_EVAL_NONE: 413*8b97b69aSJeremy L Thompson if (!use_3d_slices && !is_at_points) { 4144b3e95d5SJeremy L Thompson code << " CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n"; 4154b3e95d5SJeremy L Thompson } 4164b3e95d5SJeremy L Thompson break; 4174b3e95d5SJeremy L Thompson case CEED_EVAL_INTERP: 418*8b97b69aSJeremy L Thompson if (is_at_points) { 419*8b97b69aSJeremy L Thompson code << " CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "];\n"; 420*8b97b69aSJeremy L Thompson code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name 421*8b97b69aSJeremy L Thompson << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n"; 422*8b97b69aSJeremy L Thompson } else { 4234b3e95d5SJeremy L Thompson code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n"; 4244b3e95d5SJeremy L Thompson code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name 4254b3e95d5SJeremy L Thompson << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n"; 426*8b97b69aSJeremy L Thompson } 4274b3e95d5SJeremy L Thompson break; 4284b3e95d5SJeremy L Thompson case CEED_EVAL_GRAD: 429*8b97b69aSJeremy L Thompson if (is_at_points) { 430*8b97b69aSJeremy L Thompson code << " CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "];\n"; 431*8b97b69aSJeremy L Thompson code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name 432*8b97b69aSJeremy L Thompson << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n"; 433*8b97b69aSJeremy L Thompson } else if (use_3d_slices) { 4344b3e95d5SJeremy L Thompson code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n"; 4354b3e95d5SJeremy L Thompson code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", P_1d" << var_suffix << ", " << Q_name 4364b3e95d5SJeremy L Thompson << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n"; 4374b3e95d5SJeremy L Thompson } else { 4384b3e95d5SJeremy L Thompson code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << Q_name << "];\n"; 4394b3e95d5SJeremy L Thompson code << " Grad" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d<num_comp" << var_suffix 4404b3e95d5SJeremy L Thompson << ", P_1d" << var_suffix << ", " << Q_name << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" 4414b3e95d5SJeremy L Thompson << var_suffix << ");\n"; 4424b3e95d5SJeremy L Thompson } 4434b3e95d5SJeremy L Thompson break; 4444b3e95d5SJeremy L Thompson case CEED_EVAL_WEIGHT: { 445*8b97b69aSJeremy L Thompson if (is_at_points) { 446*8b97b69aSJeremy L Thompson code << " // Nothing to do AtPoints\n"; 447*8b97b69aSJeremy L Thompson } else { 4484b3e95d5SJeremy L Thompson CeedBasis_Cuda_shared *basis_data; 4494b3e95d5SJeremy L Thompson 4504b3e95d5SJeremy L Thompson code << " CeedScalar r_q" << var_suffix << "[" << Q_name << "];\n"; 4514b3e95d5SJeremy L Thompson CeedCallBackend(CeedBasisGetData(basis, &basis_data)); 4524b3e95d5SJeremy L Thompson data->W = basis_data->d_q_weight_1d; 4534b3e95d5SJeremy L Thompson code << " Weight" << (dim > 1 ? "Tensor" : "") << dim << "d<" << Q_name << ">(data, W, r_q" << var_suffix << ");\n"; 454*8b97b69aSJeremy L Thompson } 4554b3e95d5SJeremy L Thompson break; 4564b3e95d5SJeremy L Thompson } 4574b3e95d5SJeremy L Thompson // LCOV_EXCL_START 4584b3e95d5SJeremy L Thompson case CEED_EVAL_DIV: 4594b3e95d5SJeremy L Thompson case CEED_EVAL_CURL: 4604b3e95d5SJeremy L Thompson break; // TODO: Not implemented 4614b3e95d5SJeremy L Thompson // LCOV_EXCL_STOP 4624b3e95d5SJeremy L Thompson } 4634b3e95d5SJeremy L Thompson } else { 4644b3e95d5SJeremy L Thompson switch (eval_mode) { 4654b3e95d5SJeremy L Thompson case CEED_EVAL_NONE: 4664b3e95d5SJeremy L Thompson code << " CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n"; 4674b3e95d5SJeremy L Thompson break; // No action 4684b3e95d5SJeremy L Thompson case CEED_EVAL_INTERP: 469e93651e5SJeremy L Thompson code << " CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n"; 470*8b97b69aSJeremy L Thompson if (is_at_points) { 471*8b97b69aSJeremy L Thompson code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name 472*8b97b69aSJeremy L Thompson << ">(data, r_c" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; 473*8b97b69aSJeremy L Thompson } else { 4744b3e95d5SJeremy L Thompson code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name 4754b3e95d5SJeremy L Thompson << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; 476*8b97b69aSJeremy L Thompson } 4774b3e95d5SJeremy L Thompson break; 4784b3e95d5SJeremy L Thompson case CEED_EVAL_GRAD: 479e93651e5SJeremy L Thompson code << " CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n"; 480*8b97b69aSJeremy L Thompson if (is_at_points) { 481*8b97b69aSJeremy L Thompson code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name 482*8b97b69aSJeremy L Thompson << ">(data, r_c" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; 483*8b97b69aSJeremy L Thompson } else if (use_3d_slices) { 4844b3e95d5SJeremy L Thompson code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name 4854b3e95d5SJeremy L Thompson << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n"; 4864b3e95d5SJeremy L Thompson } else { 4874b3e95d5SJeremy L Thompson code << " GradTranspose" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d<num_comp" 4884b3e95d5SJeremy L Thompson << var_suffix << ", " << P_name << "," << Q_name << ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix 4894b3e95d5SJeremy L Thompson << ", r_e" << var_suffix << ");\n"; 4904b3e95d5SJeremy L Thompson } 4914b3e95d5SJeremy L Thompson break; 4924b3e95d5SJeremy L Thompson // LCOV_EXCL_START 4934b3e95d5SJeremy L Thompson case CEED_EVAL_WEIGHT: 4944b3e95d5SJeremy L Thompson break; // Should not occur 4954b3e95d5SJeremy L Thompson case CEED_EVAL_DIV: 4964b3e95d5SJeremy L Thompson case CEED_EVAL_CURL: 4974b3e95d5SJeremy L Thompson break; // TODO: Not implemented 4984b3e95d5SJeremy L Thompson // LCOV_EXCL_STOP 4994b3e95d5SJeremy L Thompson } 5004b3e95d5SJeremy L Thompson } 501681d0ea7SJeremy L Thompson CeedCallBackend(CeedBasisDestroy(&basis)); 5024b3e95d5SJeremy L Thompson return CEED_ERROR_SUCCESS; 5034b3e95d5SJeremy L Thompson } 5044b3e95d5SJeremy L Thompson 5054b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 5064b3e95d5SJeremy L Thompson // QFunction 5074b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 508*8b97b69aSJeremy L Thompson static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt dim, CeedInt max_num_points, 509*8b97b69aSJeremy L Thompson CeedInt num_input_fields, CeedOperatorField *op_input_fields, 510*8b97b69aSJeremy L Thompson CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, 511*8b97b69aSJeremy L Thompson CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields, 512*8b97b69aSJeremy L Thompson std::string qfunction_name, CeedInt Q_1d, bool is_at_points, bool use_3d_slices) { 5134b3e95d5SJeremy L Thompson std::string Q_name = "Q_1d"; 5144b3e95d5SJeremy L Thompson CeedEvalMode eval_mode = CEED_EVAL_NONE; 5154b3e95d5SJeremy L Thompson CeedElemRestriction elem_rstr; 5164b3e95d5SJeremy L Thompson 517*8b97b69aSJeremy L Thompson // Setup output arrays 5184b3e95d5SJeremy L Thompson code << "\n // -- Output field setup\n"; 5194b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 5204b3e95d5SJeremy L Thompson std::string var_suffix = "_out_" + std::to_string(i); 5214b3e95d5SJeremy L Thompson 5224b3e95d5SJeremy L Thompson code << " // ---- Output field " << i << "\n"; 5234b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 524*8b97b69aSJeremy L Thompson switch (eval_mode) { 525*8b97b69aSJeremy L Thompson case CEED_EVAL_NONE: 526*8b97b69aSJeremy L Thompson if (is_at_points) { 527*8b97b69aSJeremy L Thompson code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n"; 528*8b97b69aSJeremy L Thompson } else { 5294b3e95d5SJeremy L Thompson code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n"; 5304b3e95d5SJeremy L Thompson } 531*8b97b69aSJeremy L Thompson break; 532*8b97b69aSJeremy L Thompson case CEED_EVAL_INTERP: 533*8b97b69aSJeremy L Thompson if (is_at_points) { 534*8b97b69aSJeremy L Thompson // Accumulator for point data 535*8b97b69aSJeremy L Thompson code << " CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "];\n"; 536*8b97b69aSJeremy L Thompson code << " for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "; i++) {\n"; 537*8b97b69aSJeremy L Thompson code << " r_c" << var_suffix << "[i] = 0.0;\n"; 538*8b97b69aSJeremy L Thompson code << " }\n"; 539*8b97b69aSJeremy L Thompson } else { 540*8b97b69aSJeremy L Thompson code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n"; 541*8b97b69aSJeremy L Thompson } 542*8b97b69aSJeremy L Thompson break; 543*8b97b69aSJeremy L Thompson case CEED_EVAL_GRAD: 544*8b97b69aSJeremy L Thompson if (is_at_points) { 545*8b97b69aSJeremy L Thompson // Accumulator for point data 546*8b97b69aSJeremy L Thompson code << " CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "*dim];\n"; 547*8b97b69aSJeremy L Thompson code << " for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (dim > 2 ? Q_name : "1") << "; i++) {\n"; 548*8b97b69aSJeremy L Thompson code << " r_c" << var_suffix << "[i] = 0.0;\n"; 549*8b97b69aSJeremy L Thompson code << " }\n"; 550*8b97b69aSJeremy L Thompson } else if (use_3d_slices) { 5514b3e95d5SJeremy L Thompson // Accumulator for gradient slices 5524b3e95d5SJeremy L Thompson code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n"; 5534b3e95d5SJeremy L Thompson code << " for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) {\n"; 5544b3e95d5SJeremy L Thompson code << " r_q" << var_suffix << "[i] = 0.0;\n"; 5554b3e95d5SJeremy L Thompson code << " }\n"; 5564b3e95d5SJeremy L Thompson } else { 5574b3e95d5SJeremy L Thompson code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << Q_name << "];\n"; 5584b3e95d5SJeremy L Thompson } 559*8b97b69aSJeremy L Thompson break; 560*8b97b69aSJeremy L Thompson case CEED_EVAL_WEIGHT: 561*8b97b69aSJeremy L Thompson break; 562*8b97b69aSJeremy L Thompson // LCOV_EXCL_START 563*8b97b69aSJeremy L Thompson case CEED_EVAL_DIV: 564*8b97b69aSJeremy L Thompson case CEED_EVAL_CURL: 565*8b97b69aSJeremy L Thompson break; // TODO: Not implemented 566*8b97b69aSJeremy L Thompson // LCOV_EXCL_STOP 5674b3e95d5SJeremy L Thompson } 5684b3e95d5SJeremy L Thompson } 5694b3e95d5SJeremy L Thompson 570*8b97b69aSJeremy L Thompson if (is_at_points) { 571*8b97b69aSJeremy L Thompson // We need to handle batches of points 572*8b97b69aSJeremy L Thompson code << "\n // Note: Using batches of points\n"; 573*8b97b69aSJeremy L Thompson code << " const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * max_num_points / (blockDim.x * blockDim.y));\n\n"; 574*8b97b69aSJeremy L Thompson code << " #pragma unroll\n"; 575*8b97b69aSJeremy L Thompson code << " for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {\n"; 576*8b97b69aSJeremy L Thompson code << " const CeedInt p = i % max_num_points;\n\n"; 577*8b97b69aSJeremy L Thompson 578*8b97b69aSJeremy L Thompson code << " // -- Coordinates\n"; 579*8b97b69aSJeremy L Thompson code << " CeedScalar r_x[dim];\n"; 580*8b97b69aSJeremy L Thompson code << " ReadPoint<dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n"; 581*8b97b69aSJeremy L Thompson 582*8b97b69aSJeremy L Thompson code << " // -- Input fields\n"; 583*8b97b69aSJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 584*8b97b69aSJeremy L Thompson std::string var_suffix = "_in_" + std::to_string(i); 585*8b97b69aSJeremy L Thompson 586*8b97b69aSJeremy L Thompson code << " // ---- Input field " << i << "\n"; 587*8b97b69aSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 588*8b97b69aSJeremy L Thompson // Basis action 589*8b97b69aSJeremy L Thompson code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 590*8b97b69aSJeremy L Thompson switch (eval_mode) { 591*8b97b69aSJeremy L Thompson case CEED_EVAL_NONE: 592*8b97b69aSJeremy L Thompson code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 593*8b97b69aSJeremy L Thompson code << " ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix 594*8b97b69aSJeremy L Thompson << ", max_num_points>(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n"; 595*8b97b69aSJeremy L Thompson break; 596*8b97b69aSJeremy L Thompson case CEED_EVAL_INTERP: 597*8b97b69aSJeremy L Thompson code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 598*8b97b69aSJeremy L Thompson code << " InterpAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_c" << var_suffix 599*8b97b69aSJeremy L Thompson << ", r_x, r_s" << var_suffix << ");\n"; 600*8b97b69aSJeremy L Thompson break; 601*8b97b69aSJeremy L Thompson case CEED_EVAL_GRAD: 602*8b97b69aSJeremy L Thompson code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n"; 603*8b97b69aSJeremy L Thompson code << " GradAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_c" << var_suffix 604*8b97b69aSJeremy L Thompson << ", r_x, r_s" << var_suffix << ");\n"; 605*8b97b69aSJeremy L Thompson break; 606*8b97b69aSJeremy L Thompson case CEED_EVAL_WEIGHT: 607*8b97b69aSJeremy L Thompson code << " CeedScalar r_s" << var_suffix << "[1];\n"; 608*8b97b69aSJeremy L Thompson code << " r_s" << var_suffix << "[0] = 1.0;\n"; 609*8b97b69aSJeremy L Thompson break; 610*8b97b69aSJeremy L Thompson // LCOV_EXCL_START 611*8b97b69aSJeremy L Thompson case CEED_EVAL_DIV: 612*8b97b69aSJeremy L Thompson case CEED_EVAL_CURL: 613*8b97b69aSJeremy L Thompson break; // TODO: Not implemented 614*8b97b69aSJeremy L Thompson // LCOV_EXCL_STOP 615*8b97b69aSJeremy L Thompson } 616*8b97b69aSJeremy L Thompson } 617*8b97b69aSJeremy L Thompson code << "\n // -- Output fields\n"; 618*8b97b69aSJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 619*8b97b69aSJeremy L Thompson std::string var_suffix = "_out_" + std::to_string(i); 620*8b97b69aSJeremy L Thompson 621*8b97b69aSJeremy L Thompson code << " // ---- Output field " << i << "\n"; 622*8b97b69aSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 623*8b97b69aSJeremy L Thompson // Basis action 624*8b97b69aSJeremy L Thompson switch (eval_mode) { 625*8b97b69aSJeremy L Thompson case CEED_EVAL_NONE: 626*8b97b69aSJeremy L Thompson code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 627*8b97b69aSJeremy L Thompson break; 628*8b97b69aSJeremy L Thompson case CEED_EVAL_INTERP: 629*8b97b69aSJeremy L Thompson code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 630*8b97b69aSJeremy L Thompson break; 631*8b97b69aSJeremy L Thompson case CEED_EVAL_GRAD: 632*8b97b69aSJeremy L Thompson code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n"; 633*8b97b69aSJeremy L Thompson break; 634*8b97b69aSJeremy L Thompson // LCOV_EXCL_START 635*8b97b69aSJeremy L Thompson case CEED_EVAL_WEIGHT: 636*8b97b69aSJeremy L Thompson break; // Should not occur 637*8b97b69aSJeremy L Thompson case CEED_EVAL_DIV: 638*8b97b69aSJeremy L Thompson case CEED_EVAL_CURL: 639*8b97b69aSJeremy L Thompson break; // TODO: Not implemented 640*8b97b69aSJeremy L Thompson // LCOV_EXCL_STOP 641*8b97b69aSJeremy L Thompson } 642*8b97b69aSJeremy L Thompson } 643*8b97b69aSJeremy L Thompson 644*8b97b69aSJeremy L Thompson } else if (use_3d_slices) { 6454b3e95d5SJeremy L Thompson // We treat quadrature points per slice in 3d to save registers 6464b3e95d5SJeremy L Thompson code << "\n // Note: Using planes of 3D elements\n"; 6474b3e95d5SJeremy L Thompson code << " #pragma unroll\n"; 6484b3e95d5SJeremy L Thompson code << " for (CeedInt q = 0; q < " << Q_name << "; q++) {\n"; 6494b3e95d5SJeremy L Thompson code << " // -- Input fields\n"; 6504b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 6514b3e95d5SJeremy L Thompson std::string var_suffix = "_in_" + std::to_string(i); 6524b3e95d5SJeremy L Thompson 6534b3e95d5SJeremy L Thompson code << " // ---- Input field " << i << "\n"; 6544b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 6554b3e95d5SJeremy L Thompson // Basis action 6564b3e95d5SJeremy L Thompson code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 6574b3e95d5SJeremy L Thompson switch (eval_mode) { 6584b3e95d5SJeremy L Thompson case CEED_EVAL_NONE: 6594b3e95d5SJeremy L Thompson bool is_strided; 6604b3e95d5SJeremy L Thompson 6614b3e95d5SJeremy L Thompson code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 6624b3e95d5SJeremy L Thompson 6634b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 6644b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided)); 6654b3e95d5SJeremy L Thompson if (is_strided) { 6664b3e95d5SJeremy L Thompson bool has_backend_strides; 6674b3e95d5SJeremy L Thompson CeedInt num_elem, elem_size; 6684b3e95d5SJeremy L Thompson 6694b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 6704b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides)); 6714b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem)); 6724b3e95d5SJeremy L Thompson CeedInt strides[3] = {1, elem_size * num_elem, elem_size}; 6734b3e95d5SJeremy L Thompson 6744b3e95d5SJeremy L Thompson if (!has_backend_strides) { 6754b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides)); 6764b3e95d5SJeremy L Thompson } 6774b3e95d5SJeremy L Thompson code << " // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n"; 678f815fac9SJeremy L Thompson code << " ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << "," << strides[0] << "," << strides[1] << "," 6794b3e95d5SJeremy L Thompson << strides[2] << ">(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n"; 6804b3e95d5SJeremy L Thompson } else { 6814b3e95d5SJeremy L Thompson CeedSize l_size = 0; 6824b3e95d5SJeremy L Thompson CeedInt comp_stride; 6834b3e95d5SJeremy L Thompson CeedElemRestriction_Cuda *rstr_data; 6844b3e95d5SJeremy L Thompson 6854b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); 6864b3e95d5SJeremy L Thompson code << " const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; 6874b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 6884b3e95d5SJeremy L Thompson code << " // CompStride: " << comp_stride << "\n"; 6894b3e95d5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data)); 6904b3e95d5SJeremy L Thompson data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; 691f815fac9SJeremy L Thompson code << " ReadEVecSliceStandard3d<num_comp" << var_suffix << ", " << comp_stride << ", " << Q_name << ">(data, l_size" << var_suffix 6924b3e95d5SJeremy L Thompson << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n"; 6934b3e95d5SJeremy L Thompson } 694681d0ea7SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 6954b3e95d5SJeremy L Thompson break; 6964b3e95d5SJeremy L Thompson case CEED_EVAL_INTERP: 6974b3e95d5SJeremy L Thompson code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 6984b3e95d5SJeremy L Thompson code << " for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n"; 6994b3e95d5SJeremy L Thompson code << " r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n"; 7004b3e95d5SJeremy L Thompson code << " }\n"; 7014b3e95d5SJeremy L Thompson break; 7024b3e95d5SJeremy L Thompson case CEED_EVAL_GRAD: 7034b3e95d5SJeremy L Thompson code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n"; 704f815fac9SJeremy L Thompson code << " GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_q" << var_suffix << ", s_G" << var_suffix 705f815fac9SJeremy L Thompson << ", r_s" << var_suffix << ");\n"; 7064b3e95d5SJeremy L Thompson break; 7074b3e95d5SJeremy L Thompson case CEED_EVAL_WEIGHT: 7084b3e95d5SJeremy L Thompson code << " CeedScalar r_s" << var_suffix << "[1];\n"; 7094b3e95d5SJeremy L Thompson code << " r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n"; 710*8b97b69aSJeremy L Thompson break; 7114b3e95d5SJeremy L Thompson // LCOV_EXCL_START 7124b3e95d5SJeremy L Thompson case CEED_EVAL_DIV: 7134b3e95d5SJeremy L Thompson case CEED_EVAL_CURL: 7144b3e95d5SJeremy L Thompson break; // TODO: Not implemented 7154b3e95d5SJeremy L Thompson // LCOV_EXCL_STOP 7164b3e95d5SJeremy L Thompson } 7174b3e95d5SJeremy L Thompson } 7184b3e95d5SJeremy L Thompson code << "\n // -- Output fields\n"; 7194b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 7204b3e95d5SJeremy L Thompson std::string var_suffix = "_out_" + std::to_string(i); 7214b3e95d5SJeremy L Thompson 7224b3e95d5SJeremy L Thompson code << " // ---- Output field " << i << "\n"; 7234b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 7244b3e95d5SJeremy L Thompson // Basis action 7254b3e95d5SJeremy L Thompson switch (eval_mode) { 7264b3e95d5SJeremy L Thompson case CEED_EVAL_NONE: 7274b3e95d5SJeremy L Thompson code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 728*8b97b69aSJeremy L Thompson break; 7294b3e95d5SJeremy L Thompson case CEED_EVAL_INTERP: 7304b3e95d5SJeremy L Thompson code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n"; 7314b3e95d5SJeremy L Thompson break; 7324b3e95d5SJeremy L Thompson case CEED_EVAL_GRAD: 7334b3e95d5SJeremy L Thompson code << " CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim];\n"; 7344b3e95d5SJeremy L Thompson break; 7354b3e95d5SJeremy L Thompson // LCOV_EXCL_START 7364b3e95d5SJeremy L Thompson case CEED_EVAL_WEIGHT: 7374b3e95d5SJeremy L Thompson break; // Should not occur 7384b3e95d5SJeremy L Thompson case CEED_EVAL_DIV: 7394b3e95d5SJeremy L Thompson case CEED_EVAL_CURL: 7404b3e95d5SJeremy L Thompson break; // TODO: Not implemented 7414b3e95d5SJeremy L Thompson // LCOV_EXCL_STOP 7424b3e95d5SJeremy L Thompson } 7434b3e95d5SJeremy L Thompson } 7444b3e95d5SJeremy L Thompson } else { 7454b3e95d5SJeremy L Thompson code << "\n // Note: Using full elements\n"; 7464b3e95d5SJeremy L Thompson code << " {\n"; 7474b3e95d5SJeremy L Thompson code << " // -- Input fields\n"; 7484b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 7494b3e95d5SJeremy L Thompson code << " // ---- Input field " << i << "\n"; 7504b3e95d5SJeremy L Thompson code << " CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n"; 7514b3e95d5SJeremy L Thompson } 7524b3e95d5SJeremy L Thompson code << " // -- Output fields\n"; 7534b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 7544b3e95d5SJeremy L Thompson code << " // ---- Output field " << i << "\n"; 7554b3e95d5SJeremy L Thompson code << " CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n"; 7564b3e95d5SJeremy L Thompson } 7574b3e95d5SJeremy L Thompson } 7584b3e95d5SJeremy L Thompson 7594b3e95d5SJeremy L Thompson // Input and output buffers 7604b3e95d5SJeremy L Thompson code << "\n // -- QFunction inputs and outputs\n"; 7614b3e95d5SJeremy L Thompson code << " // ---- Inputs\n"; 7624b3e95d5SJeremy L Thompson code << " CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n"; 7634b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 7644b3e95d5SJeremy L Thompson code << " // ------ Input field " << i << "\n"; 7654b3e95d5SJeremy L Thompson code << " inputs[" << i << "] = r_s_in_" << i << ";\n"; 7664b3e95d5SJeremy L Thompson } 7674b3e95d5SJeremy L Thompson code << " // ---- Outputs\n"; 7684b3e95d5SJeremy L Thompson code << " CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n"; 7694b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 7704b3e95d5SJeremy L Thompson code << " // ------ Output field " << i << "\n"; 7714b3e95d5SJeremy L Thompson code << " outputs[" << i << "] = r_s_out_" << i << ";\n"; 7724b3e95d5SJeremy L Thompson } 7734b3e95d5SJeremy L Thompson 7744b3e95d5SJeremy L Thompson // Apply QFunction 7754b3e95d5SJeremy L Thompson code << "\n // -- Apply QFunction\n"; 7764b3e95d5SJeremy L Thompson code << " " << qfunction_name << "(ctx, "; 777*8b97b69aSJeremy L Thompson if (dim != 3 || is_at_points || use_3d_slices) { 7784b3e95d5SJeremy L Thompson code << "1"; 7794b3e95d5SJeremy L Thompson } else { 7804b3e95d5SJeremy L Thompson code << "Q_1d"; 7814b3e95d5SJeremy L Thompson } 7824b3e95d5SJeremy L Thompson code << ", inputs, outputs);\n"; 7834b3e95d5SJeremy L Thompson 784*8b97b69aSJeremy L Thompson if (is_at_points) { 785*8b97b69aSJeremy L Thompson // Map back to coefficients 786*8b97b69aSJeremy L Thompson code << "\n // -- Output fields\n"; 787*8b97b69aSJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 788*8b97b69aSJeremy L Thompson std::string var_suffix = "_out_" + std::to_string(i); 789*8b97b69aSJeremy L Thompson std::string P_name = "P_1d" + var_suffix; 790*8b97b69aSJeremy L Thompson 791*8b97b69aSJeremy L Thompson code << " // ---- Output field " << i << "\n"; 792*8b97b69aSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 793*8b97b69aSJeremy L Thompson // Basis action 794*8b97b69aSJeremy L Thompson code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 795*8b97b69aSJeremy L Thompson switch (eval_mode) { 796*8b97b69aSJeremy L Thompson case CEED_EVAL_NONE: { 797*8b97b69aSJeremy L Thompson CeedInt comp_stride; 798*8b97b69aSJeremy L Thompson CeedElemRestriction elem_rstr; 799*8b97b69aSJeremy L Thompson 800*8b97b69aSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 801*8b97b69aSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); 802*8b97b69aSJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 803*8b97b69aSJeremy L Thompson code << " const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n"; 804*8b97b69aSJeremy L Thompson code << " WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix 805*8b97b69aSJeremy L Thompson << ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]" 806*8b97b69aSJeremy L Thompson << ", r_s" << var_suffix << ", d" << var_suffix << ");\n"; 807*8b97b69aSJeremy L Thompson break; 808*8b97b69aSJeremy L Thompson } 809*8b97b69aSJeremy L Thompson case CEED_EVAL_INTERP: 810*8b97b69aSJeremy L Thompson code << " if (i >= points.num_per_elem[elem]) {\n"; 811*8b97b69aSJeremy L Thompson code << " for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n"; 812*8b97b69aSJeremy L Thompson code << " }\n"; 813*8b97b69aSJeremy L Thompson code << " InterpTransposeAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_s" 814*8b97b69aSJeremy L Thompson << var_suffix << ", r_x, r_c" << var_suffix << ");\n"; 815*8b97b69aSJeremy L Thompson break; 816*8b97b69aSJeremy L Thompson case CEED_EVAL_GRAD: 817*8b97b69aSJeremy L Thompson code << " if (i >= points.num_per_elem[elem]) {\n"; 818*8b97b69aSJeremy L Thompson code << " for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim; j++) r_s" << var_suffix << "[j] = 0.0;\n"; 819*8b97b69aSJeremy L Thompson code << " }\n"; 820*8b97b69aSJeremy L Thompson code << " GradTransposeAtPoints" << dim << "d<num_comp" << var_suffix << ", max_num_points, " << Q_name << ">(data, i, r_s" 821*8b97b69aSJeremy L Thompson << var_suffix << ", r_x, r_c" << var_suffix << ");\n"; 822*8b97b69aSJeremy L Thompson break; 823*8b97b69aSJeremy L Thompson // LCOV_EXCL_START 824*8b97b69aSJeremy L Thompson case CEED_EVAL_WEIGHT: 825*8b97b69aSJeremy L Thompson break; // Should not occur 826*8b97b69aSJeremy L Thompson case CEED_EVAL_DIV: 827*8b97b69aSJeremy L Thompson case CEED_EVAL_CURL: 828*8b97b69aSJeremy L Thompson break; // TODO: Not implemented 829*8b97b69aSJeremy L Thompson // LCOV_EXCL_STOP 830*8b97b69aSJeremy L Thompson } 831*8b97b69aSJeremy L Thompson } 832*8b97b69aSJeremy L Thompson } else if (use_3d_slices) { 8334b3e95d5SJeremy L Thompson // Copy or apply transpose grad, if needed 834*8b97b69aSJeremy L Thompson code << "\n // -- Output fields\n"; 8354b3e95d5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 8364b3e95d5SJeremy L Thompson std::string var_suffix = "_out_" + std::to_string(i); 8374b3e95d5SJeremy L Thompson std::string P_name = "P_1d" + var_suffix; 8384b3e95d5SJeremy L Thompson 8394b3e95d5SJeremy L Thompson code << " // ---- Output field " << i << "\n"; 8404b3e95d5SJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode)); 8414b3e95d5SJeremy L Thompson // Basis action 8424b3e95d5SJeremy L Thompson code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; 8434b3e95d5SJeremy L Thompson switch (eval_mode) { 8444b3e95d5SJeremy L Thompson case CEED_EVAL_NONE: 8454b3e95d5SJeremy L Thompson code << " for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n"; 8464b3e95d5SJeremy L Thompson code << " r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n"; 8474b3e95d5SJeremy L Thompson code << " }\n"; 848*8b97b69aSJeremy L Thompson break; 8494b3e95d5SJeremy L Thompson case CEED_EVAL_INTERP: 8504b3e95d5SJeremy L Thompson code << " for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n"; 8514b3e95d5SJeremy L Thompson code << " r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n"; 8524b3e95d5SJeremy L Thompson code << " }\n"; 8534b3e95d5SJeremy L Thompson break; 8544b3e95d5SJeremy L Thompson case CEED_EVAL_GRAD: 855f815fac9SJeremy L Thompson code << " GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ">(data, q, r_s" << var_suffix << ", s_G" 856f815fac9SJeremy L Thompson << var_suffix << ", r_q" << var_suffix << ");\n"; 8574b3e95d5SJeremy L Thompson break; 8584b3e95d5SJeremy L Thompson // LCOV_EXCL_START 8594b3e95d5SJeremy L Thompson case CEED_EVAL_WEIGHT: 8604b3e95d5SJeremy L Thompson break; // Should not occur 8614b3e95d5SJeremy L Thompson case CEED_EVAL_DIV: 8624b3e95d5SJeremy L Thompson case CEED_EVAL_CURL: 8634b3e95d5SJeremy L Thompson break; // TODO: Not implemented 8644b3e95d5SJeremy L Thompson // LCOV_EXCL_STOP 8654b3e95d5SJeremy L Thompson } 8664b3e95d5SJeremy L Thompson } 8674b3e95d5SJeremy L Thompson } 8684b3e95d5SJeremy L Thompson code << " }\n"; 8694b3e95d5SJeremy L Thompson return CEED_ERROR_SUCCESS; 8704b3e95d5SJeremy L Thompson } 8714b3e95d5SJeremy L Thompson 8724b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------ 873b2165e7aSSebastian Grimberg // Build single operator kernel 874ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 875eb7e6cafSJeremy L Thompson extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) { 876*8b97b69aSJeremy L Thompson bool is_tensor = true, is_at_points = false, use_3d_slices = false; 877241a4b83SYohann Ceed ceed; 878*8b97b69aSJeremy L Thompson CeedInt Q_1d, num_input_fields, num_output_fields, dim = 1, max_num_points = 0, coords_comp_stride = 0; 879ca735530SJeremy L Thompson CeedQFunctionField *qf_input_fields, *qf_output_fields; 880ca735530SJeremy L Thompson CeedQFunction_Cuda_gen *qf_data; 881ca735530SJeremy L Thompson CeedQFunction qf; 882ca735530SJeremy L Thompson CeedOperatorField *op_input_fields, *op_output_fields; 883ca735530SJeremy L Thompson CeedOperator_Cuda_gen *data; 8844b3e95d5SJeremy L Thompson std::ostringstream code; 8854b3e95d5SJeremy L Thompson 8864b3e95d5SJeremy L Thompson { 8874b3e95d5SJeremy L Thompson bool is_setup_done; 888ca735530SJeremy L Thompson 889ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); 890ca735530SJeremy L Thompson if (is_setup_done) return CEED_ERROR_SUCCESS; 8914b3e95d5SJeremy L Thompson } 892ca735530SJeremy L Thompson 893ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 894ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorGetData(op, &data)); 895ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); 896ca735530SJeremy L Thompson CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); 897ca735530SJeremy L Thompson CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); 898ca735530SJeremy L Thompson CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); 899241a4b83SYohann 9004b3e95d5SJeremy L Thompson // Get operator data 901*8b97b69aSJeremy L Thompson CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points)); 9024b3e95d5SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelData_Cuda_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields, 9034b3e95d5SJeremy L Thompson qf_output_fields, &data->max_P_1d, &Q_1d, &dim, &is_tensor, &use_3d_slices)); 9044b3e95d5SJeremy L Thompson if (dim == 0) dim = 1; 9054b3e95d5SJeremy L Thompson data->dim = dim; 906*8b97b69aSJeremy L Thompson if (is_at_points) { 907*8b97b69aSJeremy L Thompson CeedElemRestriction_Cuda *rstr_data; 908*8b97b69aSJeremy L Thompson CeedElemRestriction rstr_points = NULL; 9094b3e95d5SJeremy L Thompson 910*8b97b69aSJeremy L Thompson CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL)); 911*8b97b69aSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points)); 912*8b97b69aSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride)); 913*8b97b69aSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data)); 914*8b97b69aSJeremy L Thompson data->points.indices = (CeedInt *)rstr_data->d_offsets; 915*8b97b69aSJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points)); 916*8b97b69aSJeremy L Thompson } 917*8b97b69aSJeremy L Thompson if (is_at_points) use_3d_slices = false; 918*8b97b69aSJeremy L Thompson if (Q_1d == 0) { 919*8b97b69aSJeremy L Thompson if (is_at_points) Q_1d = max_num_points; 920*8b97b69aSJeremy L Thompson else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d)); 9214b3e95d5SJeremy L Thompson } 9224b3e95d5SJeremy L Thompson data->Q_1d = Q_1d; 9234b3e95d5SJeremy L Thompson 9240b454692Sjeremylt // Check for restriction only identity operator 9254b3e95d5SJeremy L Thompson { 9264b3e95d5SJeremy L Thompson bool is_identity_qf; 9274b3e95d5SJeremy L Thompson 9282b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf)); 9290b454692Sjeremylt if (is_identity_qf) { 9309e201c85SYohann CeedEvalMode eval_mode_in, eval_mode_out; 931ca735530SJeremy L Thompson 9322b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in)); 9332b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out)); 9346574a04fSJeremy L Thompson CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND, 9356574a04fSJeremy L Thompson "Backend does not implement restriction only identity operators"); 9360b454692Sjeremylt } 9374b3e95d5SJeremy L Thompson } 9380b454692Sjeremylt 939f1a13f77SYohann Dudouit // Add atomicAdd function for old NVidia architectures 9404b3e95d5SJeremy L Thompson { 9414b3e95d5SJeremy L Thompson Ceed_Cuda *ceed_data; 9424b3e95d5SJeremy L Thompson struct cudaDeviceProp prop; 9434b3e95d5SJeremy L Thompson 9442b730f8bSJeremy L Thompson CeedCallBackend(CeedGetData(ceed, &ceed_data)); 9452b730f8bSJeremy L Thompson CeedCallBackend(cudaGetDeviceProperties(&prop, ceed_data->device_id)); 94680a9ef05SNatalie Beams if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)) { 9479c25dd66SJeremy L Thompson code << "// AtomicAdd fallback source\n"; 9489c25dd66SJeremy L Thompson code << "#include <ceed/jit-source/cuda/cuda-atomic-add-fallback.h>\n\n"; 949f1a13f77SYohann Dudouit } 9504b3e95d5SJeremy L Thompson } 951f1a13f77SYohann Dudouit 9529e201c85SYohann // Load basis source files 9534b3e95d5SJeremy L Thompson // TODO: Add non-tensor, AtPoints 9549c25dd66SJeremy L Thompson code << "// Tensor basis source\n"; 9559c25dd66SJeremy L Thompson code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n"; 956*8b97b69aSJeremy L Thompson code << "// AtPoints basis source\n"; 957*8b97b69aSJeremy L Thompson code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h>\n\n"; 9589c25dd66SJeremy L Thompson code << "// CodeGen operator source\n"; 9599c25dd66SJeremy L Thompson code << "#include <ceed/jit-source/cuda/cuda-gen-templates.h>\n\n"; 960241a4b83SYohann 9614b3e95d5SJeremy L Thompson // Get QFunction name 9624b3e95d5SJeremy L Thompson std::string qfunction_name(qf_data->qfunction_name); 9634b3e95d5SJeremy L Thompson std::string operator_name; 9644b3e95d5SJeremy L Thompson 96509095acaSJeremy L Thompson operator_name = "CeedKernelCudaGenOperator_" + qfunction_name; 9664d537eeaSYohann 9679e201c85SYohann // Define CEED_Q_VLA 9689e201c85SYohann code << "\n#undef CEED_Q_VLA\n"; 969*8b97b69aSJeremy L Thompson if (dim != 3 || is_at_points || use_3d_slices) { 9709e201c85SYohann code << "#define CEED_Q_VLA 1\n\n"; 9719e201c85SYohann } else { 9729e201c85SYohann code << "#define CEED_Q_VLA " << Q_1d << "\n\n"; 9739e201c85SYohann } 9749e201c85SYohann 9754b3e95d5SJeremy L Thompson // Add user QFunction source 9764b3e95d5SJeremy L Thompson { 9779c25dd66SJeremy L Thompson const char *source_path; 9784b3e95d5SJeremy L Thompson 9799c25dd66SJeremy L Thompson CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path)); 9809c25dd66SJeremy L Thompson CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/cuda/gen backend requires QFunction source code file"); 9819c25dd66SJeremy L Thompson 9829c25dd66SJeremy L Thompson code << "// User QFunction source\n"; 9839c25dd66SJeremy L Thompson code << "#include \"" << source_path << "\"\n\n"; 9844b3e95d5SJeremy L Thompson } 985241a4b83SYohann 986241a4b83SYohann // Setup 987818e0025SJeremy L Thompson code << "\n// -----------------------------------------------------------------------------\n"; 9884b3e95d5SJeremy L Thompson code << "// Operator Kernel\n"; 9894b3e95d5SJeremy L Thompson code << "// \n"; 9904b3e95d5SJeremy L Thompson code << "// d_[in,out]_i: CeedVector device array\n"; 9914b3e95d5SJeremy L Thompson code << "// r_[in,out]_e_i: Element vector register\n"; 9924b3e95d5SJeremy L Thompson code << "// r_[in,out]_q_i: Quadrature space vector register\n"; 993*8b97b69aSJeremy L Thompson code << "// r_[in,out]_c_i: AtPoints Chebyshev coefficents register\n"; 9944b3e95d5SJeremy L Thompson code << "// r_[in,out]_s_i: Quadrature space slice vector register\n"; 9954b3e95d5SJeremy L Thompson code << "// \n"; 9964b3e95d5SJeremy L Thompson code << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n"; 9974b3e95d5SJeremy L Thompson code << "// s_G_[in,out]_i: Gradient matrix, shared memory\n"; 9984b3e95d5SJeremy L Thompson code << "// -----------------------------------------------------------------------------\n"; 9994b3e95d5SJeremy L Thompson code << "extern \"C\" __global__ void " << operator_name 1000*8b97b69aSJeremy L Thompson << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda " 1001*8b97b69aSJeremy L Thompson "points) {\n"; 10024b3e95d5SJeremy L Thompson 10034b3e95d5SJeremy L Thompson // Scratch buffers 10049e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 10054b3e95d5SJeremy L Thompson CeedEvalMode eval_mode; 10064b3e95d5SJeremy L Thompson 10072b730f8bSJeremy L Thompson CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); 10089e201c85SYohann if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 10094b3e95d5SJeremy L Thompson code << " const CeedScalar *d_in_" << i << " = fields.inputs[" << i << "];\n"; 1010241a4b83SYohann } 1011241a4b83SYohann } 10129e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 10134b3e95d5SJeremy L Thompson code << " CeedScalar *d_out_" << i << " = fields.outputs[" << i << "];\n"; 1014241a4b83SYohann } 10151da99368SJeremy L Thompson 10169e201c85SYohann code << " const CeedInt dim = " << dim << ";\n"; 10179e201c85SYohann code << " const CeedInt Q_1d = " << Q_1d << ";\n"; 1018*8b97b69aSJeremy L Thompson if (is_at_points) { 1019*8b97b69aSJeremy L Thompson code << " const CeedInt max_num_points = " << max_num_points << ";\n"; 1020*8b97b69aSJeremy L Thompson code << " const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n"; 1021*8b97b69aSJeremy L Thompson } 10221da99368SJeremy L Thompson 10234b3e95d5SJeremy L Thompson // Shared data 1024241a4b83SYohann code << " extern __shared__ CeedScalar slice[];\n"; 10259e201c85SYohann code << " SharedData_Cuda data;\n"; 10269e201c85SYohann code << " data.t_id_x = threadIdx.x;\n"; 10279e201c85SYohann code << " data.t_id_y = threadIdx.y;\n"; 10289e201c85SYohann code << " data.t_id_z = threadIdx.z;\n"; 10299e201c85SYohann code << " data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 10309e201c85SYohann code << " data.slice = slice + data.t_id_z*T_1D" << (dim > 1 ? "*T_1D" : "") << ";\n"; 1031920dcdc4Sjeremylt 1032ac421f39SYohann // Initialize constants, and matrices B and G 10334b3e95d5SJeremy L Thompson code << "\n // Input field constants and basis data\n"; 10349e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 1035*8b97b69aSJeremy L Thompson CeedCallBackend( 1036*8b97b69aSJeremy L Thompson CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, is_at_points, use_3d_slices)); 1037920dcdc4Sjeremylt } 10384b3e95d5SJeremy L Thompson code << "\n // Output field constants and basis data\n"; 10399e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 1040*8b97b69aSJeremy L Thompson CeedCallBackend( 1041*8b97b69aSJeremy L Thompson CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_at_points, use_3d_slices)); 10424b3e95d5SJeremy L Thompson } 1043920dcdc4Sjeremylt 10444b3e95d5SJeremy L Thompson // Loop over all elements 10454b3e95d5SJeremy L Thompson code << "\n // Element loop\n"; 1046ac421f39SYohann code << " __syncthreads();\n"; 10479e201c85SYohann code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n"; 10484b3e95d5SJeremy L Thompson 1049e93651e5SJeremy L Thompson // -- Compute minimum buffer space needed 1050*8b97b69aSJeremy L Thompson CeedInt max_rstr_buffer_size = 1; 1051e93651e5SJeremy L Thompson 10529e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 1053e93651e5SJeremy L Thompson CeedInt num_comp, elem_size; 1054e93651e5SJeremy L Thompson CeedElemRestriction elem_rstr; 1055e93651e5SJeremy L Thompson 1056e93651e5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr)); 1057e93651e5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 1058e93651e5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 1059e93651e5SJeremy L Thompson max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * elem_size); 1060681d0ea7SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 1061e93651e5SJeremy L Thompson } 1062e93651e5SJeremy L Thompson for (CeedInt i = 0; i < num_output_fields; i++) { 1063e93651e5SJeremy L Thompson CeedInt num_comp, elem_size; 1064e93651e5SJeremy L Thompson CeedElemRestriction elem_rstr; 1065e93651e5SJeremy L Thompson 1066e93651e5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr)); 1067e93651e5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp)); 1068e93651e5SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size)); 1069e93651e5SJeremy L Thompson max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * elem_size); 1070681d0ea7SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr)); 1071e93651e5SJeremy L Thompson } 1072e93651e5SJeremy L Thompson code << " // Scratch restriction buffer space\n"; 1073e93651e5SJeremy L Thompson code << " CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n"; 1074e93651e5SJeremy L Thompson 1075e93651e5SJeremy L Thompson // -- Determine best input field processing order 1076e93651e5SJeremy L Thompson CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX]; 1077e93651e5SJeremy L Thompson 1078e93651e5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 1079e93651e5SJeremy L Thompson field_rstr_in_buffer[i] = -1; 1080e93651e5SJeremy L Thompson input_field_order[i] = -1; 1081e93651e5SJeremy L Thompson } 1082e93651e5SJeremy L Thompson { 1083e93651e5SJeremy L Thompson bool is_ordered[CEED_FIELD_MAX]; 1084e93651e5SJeremy L Thompson CeedInt curr_index = 0; 1085e93651e5SJeremy L Thompson 1086e93651e5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false; 1087e93651e5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 1088e93651e5SJeremy L Thompson CeedVector vec_i; 1089e93651e5SJeremy L Thompson CeedElemRestriction rstr_i; 1090e93651e5SJeremy L Thompson 1091e93651e5SJeremy L Thompson if (is_ordered[i]) continue; 1092e93651e5SJeremy L Thompson field_rstr_in_buffer[i] = i; 1093e93651e5SJeremy L Thompson is_ordered[i] = true; 1094e93651e5SJeremy L Thompson input_field_order[curr_index] = i; 1095e93651e5SJeremy L Thompson curr_index++; 1096034f99fdSJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i)); 1097e93651e5SJeremy L Thompson if (vec_i == CEED_VECTOR_NONE) continue; // CEED_EVAL_WEIGHT 1098e93651e5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i)); 1099e93651e5SJeremy L Thompson for (CeedInt j = i + 1; j < num_input_fields; j++) { 1100e93651e5SJeremy L Thompson CeedVector vec_j; 1101e93651e5SJeremy L Thompson CeedElemRestriction rstr_j; 1102e93651e5SJeremy L Thompson 1103e93651e5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j)); 1104e93651e5SJeremy L Thompson CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j)); 1105e93651e5SJeremy L Thompson if (rstr_i == rstr_j && vec_i == vec_j) { 1106e93651e5SJeremy L Thompson field_rstr_in_buffer[j] = i; 1107e93651e5SJeremy L Thompson is_ordered[j] = true; 1108e93651e5SJeremy L Thompson input_field_order[curr_index] = j; 1109e93651e5SJeremy L Thompson curr_index++; 1110e93651e5SJeremy L Thompson } 1111681d0ea7SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec_j)); 1112681d0ea7SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j)); 1113e93651e5SJeremy L Thompson } 1114681d0ea7SJeremy L Thompson CeedCallBackend(CeedVectorDestroy(&vec_i)); 1115681d0ea7SJeremy L Thompson CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i)); 1116e93651e5SJeremy L Thompson } 1117e93651e5SJeremy L Thompson } 1118e93651e5SJeremy L Thompson 1119e93651e5SJeremy L Thompson // -- Input restriction and basis 1120e93651e5SJeremy L Thompson code << "\n // -- Input field restrictions and basis actions\n"; 1121e93651e5SJeremy L Thompson for (CeedInt i = 0; i < num_input_fields; i++) { 1122e93651e5SJeremy L Thompson CeedInt f = input_field_order[i]; 1123e93651e5SJeremy L Thompson 1124e93651e5SJeremy L Thompson code << " // ---- Input field " << f << "\n"; 1125920dcdc4Sjeremylt 11264b3e95d5SJeremy L Thompson // ---- Restriction 1127e93651e5SJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, f, dim, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], 1128*8b97b69aSJeremy L Thompson Q_1d, true, is_at_points, use_3d_slices)); 11299a2291e3SJeremy L Thompson 11304b3e95d5SJeremy L Thompson // ---- Basis action 1131*8b97b69aSJeremy L Thompson CeedCallBackend( 1132*8b97b69aSJeremy L Thompson CeedOperatorBuildKernelBasis_Cuda_gen(code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, is_at_points, use_3d_slices)); 1133920dcdc4Sjeremylt } 1134920dcdc4Sjeremylt 11354b3e95d5SJeremy L Thompson // -- Q function 1136*8b97b69aSJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, dim, max_num_points, num_input_fields, op_input_fields, qf_input_fields, 1137*8b97b69aSJeremy L Thompson num_output_fields, op_output_fields, qf_output_fields, qfunction_name, Q_1d, is_at_points, 1138*8b97b69aSJeremy L Thompson use_3d_slices)); 1139ca735530SJeremy L Thompson 11404b3e95d5SJeremy L Thompson // -- Output basis and restriction 11414b3e95d5SJeremy L Thompson code << "\n // -- Output field basis action and restrictions\n"; 11429e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 11434b3e95d5SJeremy L Thompson code << " // ---- Output field " << i << "\n"; 1144ca735530SJeremy L Thompson 11454b3e95d5SJeremy L Thompson // ---- Basis action 1146*8b97b69aSJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_at_points, 1147*8b97b69aSJeremy L Thompson use_3d_slices)); 11489a2291e3SJeremy L Thompson 11494b3e95d5SJeremy L Thompson // ---- Restriction 1150*8b97b69aSJeremy L Thompson CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false, 1151*8b97b69aSJeremy L Thompson is_at_points, use_3d_slices)); 1152ac421f39SYohann } 1153241a4b83SYohann 11544b3e95d5SJeremy L Thompson // Close loop and function 1155241a4b83SYohann code << " }\n"; 1156818e0025SJeremy L Thompson code << "}\n"; 1157818e0025SJeremy L Thompson code << "// -----------------------------------------------------------------------------\n\n"; 1158241a4b83SYohann 1159ab213215SJeremy L Thompson // View kernel for debugging 1160eb7e6cafSJeremy L Thompson CeedCallBackend(CeedCompile_Cuda(ceed, code.str().c_str(), &data->module, 1, "T_1D", CeedIntMax(Q_1d, data->max_P_1d))); 1161eb7e6cafSJeremy L Thompson CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op)); 11622b730f8bSJeremy L Thompson CeedCallBackend(CeedOperatorSetSetupDone(op)); 11639bc66399SJeremy L Thompson CeedCallBackend(CeedDestroy(&ceed)); 1164c11e12f4SJeremy L Thompson CeedCallBackend(CeedQFunctionDestroy(&qf)); 1165e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 1166241a4b83SYohann } 11672a86cc9dSSebastian Grimberg 1168ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 1169