13d8e8822SJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 37d8d0e25Snbeams // 43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 57d8d0e25Snbeams // 63d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 73d576824SJeremy L Thompson 87d8d0e25Snbeams #define CEED_DEBUG_COLOR 12 97d8d0e25Snbeams 10ec3da8bcSJed Brown #include <ceed/ceed.h> 11ec3da8bcSJed Brown #include <ceed/backend.h> 129e201c85SYohann #include <ceed/jit-tools.h> 137d8d0e25Snbeams #include <iostream> 143d576824SJeremy L Thompson #include <string> 157d8d0e25Snbeams #include <sstream> 163d576824SJeremy L Thompson #include "ceed-hip-gen.h" 170d0321e0SJeremy L Thompson #include "../hip-ref/ceed-hip-ref.h" 187d8d0e25Snbeams #include "../hip-shared/ceed-hip-shared.h" 197d8d0e25Snbeams #include "../hip/ceed-hip-compile.h" 207d8d0e25Snbeams 21b3e1519bSnbeams 22b3e1519bSnbeams //------------------------------------------------------------------------------ 23b3e1519bSnbeams // Calculate the block size used for launching the operator kernel 24b3e1519bSnbeams //------------------------------------------------------------------------------ 259e201c85SYohann extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt num_elem, 269e201c85SYohann const CeedInt P_1d, const CeedInt Q_1d, 27b3e1519bSnbeams CeedInt *block_sizes) { 28b3e1519bSnbeams 299e201c85SYohann const CeedInt thread1d = CeedIntMax(Q_1d, P_1d); 30b3e1519bSnbeams if (dim==1) { 319e201c85SYohann CeedInt elems_per_block = 64*thread1d > 256? 256/thread1d : 64; 329e201c85SYohann elems_per_block = elems_per_block>0?elems_per_block:1; 33b3e1519bSnbeams block_sizes[0] = thread1d; 34b3e1519bSnbeams block_sizes[1] = 1; 359e201c85SYohann block_sizes[2] = elems_per_block; 36b3e1519bSnbeams } else if (dim==2) { 379e201c85SYohann const CeedInt elems_per_block = thread1d<4? 16 : 2; 38b3e1519bSnbeams block_sizes[0] = thread1d; 39b3e1519bSnbeams block_sizes[1] = thread1d; 409e201c85SYohann block_sizes[2] = elems_per_block; 41b3e1519bSnbeams } else if (dim==3) { 429e201c85SYohann const CeedInt elems_per_block = thread1d<6? 4 : (thread1d<8? 2 : 1); 43b3e1519bSnbeams block_sizes[0] = thread1d; 44b3e1519bSnbeams block_sizes[1] = thread1d; 459e201c85SYohann block_sizes[2] = elems_per_block; 46b3e1519bSnbeams } 47b3e1519bSnbeams return CEED_ERROR_SUCCESS; 48b3e1519bSnbeams } 49b3e1519bSnbeams 507d8d0e25Snbeams //------------------------------------------------------------------------------ 519e201c85SYohann // Build single operator kernel 527d8d0e25Snbeams //------------------------------------------------------------------------------ 5337c3b1cfSnbeams extern "C" int CeedHipGenOperatorBuild(CeedOperator op) { 547d8d0e25Snbeams 557d8d0e25Snbeams using std::ostringstream; 567d8d0e25Snbeams using std::string; 577d8d0e25Snbeams int ierr; 589e201c85SYohann bool is_setup_done; 599e201c85SYohann ierr = CeedOperatorIsSetupDone(op, &is_setup_done); CeedChkBackend(ierr); 609e201c85SYohann if (is_setup_done) return CEED_ERROR_SUCCESS; 617d8d0e25Snbeams Ceed ceed; 62e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 637d8d0e25Snbeams CeedOperator_Hip_gen *data; 64e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr); 657d8d0e25Snbeams CeedQFunction qf; 667d8d0e25Snbeams CeedQFunction_Hip_gen *qf_data; 67e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); 68e15f9bd0SJeremy L Thompson ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr); 69e79b91d9SJeremy L Thompson CeedSize lsize; 709e201c85SYohann CeedInt Q, P_1d = 0, Q_1d = 0, elem_size, num_input_fields, 719e201c85SYohann num_output_fields, num_comp, dim = 1; 72e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); 739e201c85SYohann Q_1d = Q; 749e201c85SYohann CeedOperatorField *op_input_fields, *op_output_fields; 759e201c85SYohann ierr = CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields); 76e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 779e201c85SYohann CeedQFunctionField *qf_input_fields, *qf_output_fields; 789e201c85SYohann ierr = CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields); 79e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 809e201c85SYohann CeedEvalMode eval_mode; 817d8d0e25Snbeams CeedBasis basis; 827d8d0e25Snbeams CeedBasis_Hip_shared *basis_data; 837d8d0e25Snbeams CeedElemRestriction Erestrict; 847d8d0e25Snbeams CeedElemRestriction_Hip *restr_data; 857d8d0e25Snbeams 860b454692Sjeremylt // Check for restriction only identity operator 870b454692Sjeremylt bool is_identity_qf; 880b454692Sjeremylt ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr); 890b454692Sjeremylt if (is_identity_qf) { 909e201c85SYohann CeedEvalMode eval_mode_in, eval_mode_out; 919e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in); CeedChkBackend(ierr); 929e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out); CeedChkBackend(ierr); 939e201c85SYohann if (eval_mode_in == CEED_EVAL_NONE && eval_mode_out == CEED_EVAL_NONE) 940b454692Sjeremylt // LCOV_EXCL_START 950b454692Sjeremylt return CeedError(ceed, CEED_ERROR_BACKEND, 960b454692Sjeremylt "Backend does not implement restriction only identity operators"); 970b454692Sjeremylt // LCOV_EXCL_STOP 980b454692Sjeremylt } 990b454692Sjeremylt 1007d8d0e25Snbeams ostringstream code; 1019e201c85SYohann // TODO: generalize to accept different device functions? 1029e201c85SYohann { 1039e201c85SYohann char *tensor_basis_kernel_path, *tensor_basis_kernel_source; 1049e201c85SYohann ierr = CeedGetJitAbsolutePath(ceed, 1059e201c85SYohann "ceed/jit-source/hip/hip-shared-basis-tensor-templates.h", 1069e201c85SYohann &tensor_basis_kernel_path); CeedChkBackend(ierr); 1079e201c85SYohann CeedDebug256(ceed, 2, "----- Loading Tensor Basis Kernel Source -----\n"); 1089e201c85SYohann ierr = CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_kernel_source); 1099e201c85SYohann CeedChkBackend(ierr); 1109e201c85SYohann code << tensor_basis_kernel_source; 1119e201c85SYohann ierr = CeedFree(&tensor_basis_kernel_path); CeedChkBackend(ierr); 1129e201c85SYohann ierr = CeedFree(&tensor_basis_kernel_source); CeedChkBackend(ierr); 1139e201c85SYohann } 1149e201c85SYohann { 1159e201c85SYohann char *hip_gen_template_path, *hip_gen_template_source; 1169e201c85SYohann ierr = CeedGetJitAbsolutePath(ceed, 1179e201c85SYohann "ceed/jit-source/hip/hip-gen-templates.h", 1189e201c85SYohann &hip_gen_template_path); CeedChkBackend(ierr); 1199e201c85SYohann CeedDebug256(ceed, 2, "----- Loading Hip-Gen Template Source -----\n"); 1209e201c85SYohann ierr = CeedLoadSourceToBuffer(ceed, hip_gen_template_path, &hip_gen_template_source); 1219e201c85SYohann CeedChkBackend(ierr); 1229e201c85SYohann code << hip_gen_template_source; 1239e201c85SYohann ierr = CeedFree(&hip_gen_template_path); CeedChkBackend(ierr); 1249e201c85SYohann ierr = CeedFree(&hip_gen_template_source); CeedChkBackend(ierr); 1259e201c85SYohann } 1267d8d0e25Snbeams 1279e201c85SYohann string q_function_source(qf_data->q_function_source); 1289e201c85SYohann string q_function_name(qf_data->q_function_name); 1299e201c85SYohann string operator_name; 130*204bfdd7SJeremy L Thompson operator_name = "CeedKernelHipGenOperator_" + q_function_name; 1317d8d0e25Snbeams 1329e201c85SYohann // Find dim, P_1d, Q_1d 1339e201c85SYohann data->max_P_1d = 0; 1349e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 1359e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr); 1367d8d0e25Snbeams if (basis != CEED_BASIS_COLLOCATED) { 137e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 1389e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); 139e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1407d8d0e25Snbeams 1419e201c85SYohann // Collect dim, P_1d, and Q_1d 142e15f9bd0SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 1437d8d0e25Snbeams bool isTensor; 144e15f9bd0SJeremy L Thompson ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr); 1457d8d0e25Snbeams if (isTensor) { 1469e201c85SYohann ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 1479e201c85SYohann ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 1489e201c85SYohann if (P_1d > data->max_P_1d) data->max_P_1d = P_1d; 1497d8d0e25Snbeams } else { 1507d8d0e25Snbeams // LCOV_EXCL_START 151e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 1527d8d0e25Snbeams // LCOV_EXCL_STOP 1537d8d0e25Snbeams } 1547d8d0e25Snbeams } 1557d8d0e25Snbeams } 1569e201c85SYohann // Check output bases for Q_1d, dim as well 157dc64899eSYohann // The only input basis might be CEED_BASIS_COLLOCATED 1589e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 1599e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis); CeedChkBackend(ierr); 1607d8d0e25Snbeams 1617d8d0e25Snbeams if (basis != CEED_BASIS_COLLOCATED) { 162e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 1639e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 164e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1657d8d0e25Snbeams 1669e201c85SYohann // Collect Q_1d 167e15f9bd0SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 1687d8d0e25Snbeams bool isTensor; 169e15f9bd0SJeremy L Thompson ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr); 1707d8d0e25Snbeams if (isTensor) { 1719e201c85SYohann ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 1727d8d0e25Snbeams } else { 1737d8d0e25Snbeams // LCOV_EXCL_START 174e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 1757d8d0e25Snbeams // LCOV_EXCL_STOP 1767d8d0e25Snbeams } 1777d8d0e25Snbeams } 1787d8d0e25Snbeams } 1797d8d0e25Snbeams data->dim = dim; 1809e201c85SYohann data->Q_1d = Q_1d; 1817d8d0e25Snbeams 1829e201c85SYohann // Only use 3D collocated gradient parallelization strategy when gradient is computed 1839e201c85SYohann // TODO: put in a function? 1849e201c85SYohann bool use_collograd_parallelization = false; 1859e201c85SYohann if (dim == 3) { 1869e201c85SYohann bool was_grad_found = false; 1879e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 1889e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); 1899e201c85SYohann if (eval_mode == CEED_EVAL_GRAD) { 1909e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr); 1919e201c85SYohann ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 1929e201c85SYohann use_collograd_parallelization = !!basis_data->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true); 1939e201c85SYohann was_grad_found = true; 1949e201c85SYohann } 1959e201c85SYohann } 1969e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 1979e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 1989e201c85SYohann if (eval_mode == CEED_EVAL_GRAD) { 1999e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis); CeedChkBackend(ierr); 2009e201c85SYohann ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 2019e201c85SYohann use_collograd_parallelization = !!basis_data->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true); 2029e201c85SYohann was_grad_found = true; 2039e201c85SYohann } 2049e201c85SYohann } 2057d8d0e25Snbeams } 2067d8d0e25Snbeams 2079e201c85SYohann // Define CEED_Q_VLA 2089e201c85SYohann code << "\n#undef CEED_Q_VLA\n"; 2099e201c85SYohann if (dim != 3 || use_collograd_parallelization) { 2109e201c85SYohann code << "#define CEED_Q_VLA 1\n\n"; 2119e201c85SYohann } else { 2129e201c85SYohann code << "#define CEED_Q_VLA "<<Q_1d<<"\n\n"; 2139e201c85SYohann } 2149e201c85SYohann 2159e201c85SYohann code << q_function_source; 2167d8d0e25Snbeams 2177d8d0e25Snbeams // Setup 2187d8d0e25Snbeams code << "\n// -----------------------------------------------------------------------------\n"; 219b3e1519bSnbeams code << "\nextern \"C\" __launch_bounds__(BLOCK_SIZE)\n"; 2209e201c85SYohann code << "__global__ void "<<operator_name<<"(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar* W) {\n"; 2219e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 2229e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); 223e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 2249e201c85SYohann if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 2259e201c85SYohann code << " const CeedScalar* d_u_" <<i<<" = fields.inputs["<<i<<"];\n"; 2267d8d0e25Snbeams } 2277d8d0e25Snbeams } 2287d8d0e25Snbeams 2299e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 2309e201c85SYohann code << " CeedScalar* d_v_"<<i<<" = fields.outputs["<<i<<"];\n"; 2317d8d0e25Snbeams } 2327d8d0e25Snbeams 2339e201c85SYohann code << " const CeedInt dim = "<<dim<<";\n"; 2349e201c85SYohann code << " const CeedInt Q_1d = "<<Q_1d<<";\n"; 2357d8d0e25Snbeams 2367d8d0e25Snbeams code << " HIP_DYNAMIC_SHARED( CeedScalar, slice)\n"; 2379e201c85SYohann code << " SharedData_Hip data;\n"; 2389e201c85SYohann code << " data.t_id_x = threadIdx.x;\n"; 2399e201c85SYohann code << " data.t_id_y = threadIdx.y;\n"; 2409e201c85SYohann code << " data.t_id_z = threadIdx.z;\n"; 2419e201c85SYohann code << " data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 2429e201c85SYohann code << " data.slice = slice+data.t_id_z*T_1D"<<(dim>1?"*T_1D":"")<<";\n"; 2437d8d0e25Snbeams 2447d8d0e25Snbeams code << "\n // -- Input field constants and basis data --\n"; 2457d8d0e25Snbeams //Initialize constants, and matrices B and G 2469e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 2477d8d0e25Snbeams code << " // ---- Input field "<<i<<" ----\n"; 2489e201c85SYohann // Get elem_size, eval_mode, num_comp 2499e201c85SYohann ierr = CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict); 250e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 2519e201c85SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elem_size); 252e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 2539e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); 254e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 2559e201c85SYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &num_comp); 256e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 2577d8d0e25Snbeams 2587d8d0e25Snbeams // Set field constants 2599e201c85SYohann if (eval_mode != CEED_EVAL_WEIGHT) { 2609e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr); 2617d8d0e25Snbeams if (basis != CEED_BASIS_COLLOCATED) { 2629e201c85SYohann ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 2639e201c85SYohann code << " const CeedInt P_in_"<<i<<" = "<<P_1d<<";\n"; 2647d8d0e25Snbeams } else { 2659e201c85SYohann code << " const CeedInt P_in_"<<i<<" = "<<Q_1d<<";\n"; 2667d8d0e25Snbeams } 2679e201c85SYohann code << " const CeedInt num_comp_in_"<<i<<" = "<<num_comp<<";\n"; 2687d8d0e25Snbeams } 2697d8d0e25Snbeams 2707d8d0e25Snbeams // Load basis data 2719e201c85SYohann code << " // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n"; 2729e201c85SYohann switch (eval_mode) { 2737d8d0e25Snbeams case CEED_EVAL_NONE: 2747d8d0e25Snbeams break; 2757d8d0e25Snbeams case CEED_EVAL_INTERP: 276e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 2779e201c85SYohann data->B.inputs[i] = basis_data->d_interp_1d; 2789e201c85SYohann code << " __shared__ CeedScalar s_B_in_"<<i<<"["<<P_1d*Q_1d<<"];\n"; 2799e201c85SYohann code << " loadMatrix<P_in_"<<i<<",Q_1d>(data, B.inputs["<<i<<"], s_B_in_"<<i<<");\n"; 2807d8d0e25Snbeams break; 2817d8d0e25Snbeams case CEED_EVAL_GRAD: 282e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 2839e201c85SYohann data->B.inputs[i] = basis_data->d_interp_1d; 2849e201c85SYohann code << " __shared__ CeedScalar s_B_in_"<<i<<"["<<P_1d*Q_1d<<"];\n"; 2859e201c85SYohann code << " loadMatrix<P_in_"<<i<<",Q_1d>(data, B.inputs["<<i<<"], s_B_in_"<<i<<");\n"; 2869e201c85SYohann if (use_collograd_parallelization) { 2879e201c85SYohann data->G.inputs[i] = basis_data->d_collo_grad_1d; 2889e201c85SYohann code << " __shared__ CeedScalar s_G_in_"<<i<<"["<<Q_1d*Q_1d<<"];\n"; 2899e201c85SYohann code << " loadMatrix<Q_1d,Q_1d>(data, G.inputs["<<i<<"], s_G_in_"<<i<<");\n"; 2907d8d0e25Snbeams } else { 2919e201c85SYohann bool has_collo_grad = !!basis_data->d_collo_grad_1d; 2929e201c85SYohann data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; 2939e201c85SYohann code << " __shared__ CeedScalar s_G_in_"<<i<<"["<<Q_1d*(has_collo_grad?Q_1d:P_1d)<<"];\n"; 2949e201c85SYohann code << " loadMatrix<"<<(has_collo_grad?"Q_1d":("P_in_"+std::to_string(i)))<<",Q_1d>(data, G.inputs["<<i<<"], s_G_in_"<<i<<");\n"; 2957d8d0e25Snbeams } 2967d8d0e25Snbeams break; 2977d8d0e25Snbeams case CEED_EVAL_WEIGHT: 2987d8d0e25Snbeams break; // No action 2997d8d0e25Snbeams case CEED_EVAL_DIV: 3007d8d0e25Snbeams break; // TODO: Not implemented 3017d8d0e25Snbeams case CEED_EVAL_CURL: 3027d8d0e25Snbeams break; // TODO: Not implemented 3037d8d0e25Snbeams } 3047d8d0e25Snbeams } 3057d8d0e25Snbeams 3067d8d0e25Snbeams code << "\n // -- Output field constants and basis data --\n"; 3079e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 3087d8d0e25Snbeams code << " // ---- Output field "<<i<<" ----\n"; 3099e201c85SYohann // Get elem_size, eval_mode, num_comp 3109e201c85SYohann ierr = CeedOperatorFieldGetElemRestriction(op_output_fields[i], &Erestrict); 311e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 3129e201c85SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elem_size); 313e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 3149e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 315e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 3169e201c85SYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &num_comp); 317e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 3187d8d0e25Snbeams 3197d8d0e25Snbeams // Set field constants 3209e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis); CeedChkBackend(ierr); 3217d8d0e25Snbeams if (basis != CEED_BASIS_COLLOCATED) { 3229e201c85SYohann ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 3239e201c85SYohann code << " const CeedInt P_out_"<<i<<" = "<<P_1d<<";\n"; 3247d8d0e25Snbeams } else { 3259e201c85SYohann code << " const CeedInt P_out_"<<i<<" = "<<Q_1d<<";\n"; 3267d8d0e25Snbeams } 3279e201c85SYohann code << " const CeedInt num_comp_out_"<<i<<" = "<<num_comp<<";\n"; 3287d8d0e25Snbeams 3297d8d0e25Snbeams // Load basis data 3309e201c85SYohann code << " // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n"; 3319e201c85SYohann switch (eval_mode) { 3327d8d0e25Snbeams case CEED_EVAL_NONE: 3337d8d0e25Snbeams break; // No action 3347d8d0e25Snbeams case CEED_EVAL_INTERP: 335e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 3369e201c85SYohann data->B.outputs[i] = basis_data->d_interp_1d; 3379e201c85SYohann code << " __shared__ CeedScalar s_B_out_"<<i<<"["<<P_1d*Q_1d<<"];\n"; 3389e201c85SYohann code << " loadMatrix<P_out_"<<i<<",Q_1d>(data, B.outputs["<<i<<"], s_B_out_"<<i<<");\n"; 3397d8d0e25Snbeams break; 3407d8d0e25Snbeams case CEED_EVAL_GRAD: 341e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 3429e201c85SYohann data->B.outputs[i] = basis_data->d_interp_1d; 3439e201c85SYohann code << " __shared__ CeedScalar s_B_out_"<<i<<"["<<P_1d*Q_1d<<"];\n"; 3449e201c85SYohann code << " loadMatrix<P_out_"<<i<<",Q_1d>(data, B.outputs["<<i<<"], s_B_out_"<<i<<");\n"; 3459e201c85SYohann if (use_collograd_parallelization) { 3469e201c85SYohann data->G.outputs[i] = basis_data->d_collo_grad_1d; 3479e201c85SYohann code << " __shared__ CeedScalar s_G_out_"<<i<<"["<<Q_1d*Q_1d<<"];\n"; 3489e201c85SYohann code << " loadMatrix<Q_1d,Q_1d>(data, G.outputs["<<i<<"], s_G_out_"<<i<<");\n"; 3497d8d0e25Snbeams } else { 3509e201c85SYohann bool has_collo_grad = !!basis_data->d_collo_grad_1d; 3519e201c85SYohann data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; 3529e201c85SYohann code << " __shared__ CeedScalar s_G_out_"<<i<<"["<<Q_1d*(has_collo_grad?Q_1d:P_1d)<<"];\n"; 3539e201c85SYohann code << " loadMatrix<"<<(has_collo_grad?"Q_1d":("P_out_"+std::to_string(i)))<<",Q_1d>(data, G.outputs["<<i<<"], s_G_out_"<<i<<");\n"; 3547d8d0e25Snbeams } 3557d8d0e25Snbeams break; 3567d8d0e25Snbeams // LCOV_EXCL_START 3577d8d0e25Snbeams case CEED_EVAL_WEIGHT: { 3587d8d0e25Snbeams Ceed ceed; 359e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 360e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 3617d8d0e25Snbeams "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 3627d8d0e25Snbeams break; // Should not occur 3637d8d0e25Snbeams } 3647d8d0e25Snbeams case CEED_EVAL_DIV: 3657d8d0e25Snbeams break; // TODO: Not implemented 3667d8d0e25Snbeams case CEED_EVAL_CURL: 3677d8d0e25Snbeams break; // TODO: Not implemented 3687d8d0e25Snbeams // LCOV_EXCL_STOP 3697d8d0e25Snbeams } 3707d8d0e25Snbeams } 3717d8d0e25Snbeams code << "\n // -- Element loop --\n"; 3727d8d0e25Snbeams code << " __syncthreads();\n"; 3739e201c85SYohann code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n"; 3747d8d0e25Snbeams // Input basis apply if needed 3757d8d0e25Snbeams // Generate the correct eval mode code for each input 3767d8d0e25Snbeams code << " // -- Input field restrictions and basis actions --\n"; 3779e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 3787d8d0e25Snbeams code << " // ---- Input field "<<i<<" ----\n"; 3799e201c85SYohann // Get elem_size, eval_mode, num_comp 3809e201c85SYohann ierr = CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict); 381e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 3829e201c85SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elem_size); 383e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 3849e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); 385e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 3869e201c85SYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &num_comp); 387e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 3887d8d0e25Snbeams 3897d8d0e25Snbeams // Restriction 3909e201c85SYohann if (eval_mode != CEED_EVAL_WEIGHT && 3919e201c85SYohann !((eval_mode == CEED_EVAL_NONE) && use_collograd_parallelization)) { 3929e201c85SYohann code << " CeedScalar r_u_"<<i<<"[num_comp_in_"<<i<<"*P_in_"<<i<<"];\n"; 3937d8d0e25Snbeams 3949e201c85SYohann bool is_strided; 3959e201c85SYohann ierr = CeedElemRestrictionIsStrided(Erestrict, &is_strided); CeedChkBackend(ierr); 3969e201c85SYohann if (!is_strided) { 3977d8d0e25Snbeams ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 398e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 3997d8d0e25Snbeams code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 4009e201c85SYohann CeedInt comp_stride; 4019e201c85SYohann ierr = CeedElemRestrictionGetCompStride(Erestrict, &comp_stride); CeedChkBackend(ierr); 4029e201c85SYohann code << " // CompStride: "<<comp_stride<<"\n"; 403e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 4049e201c85SYohann data->indices.inputs[i] = restr_data->d_ind; 4059e201c85SYohann code << " readDofsOffset"<<dim<<"d<num_comp_in_"<<i<<", "<<comp_stride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.inputs["<<i<<"], d_u_"<<i<<", r_u_"<<i<<");\n"; 4067d8d0e25Snbeams } else { 4079e201c85SYohann bool has_backend_strides; 4089e201c85SYohann ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides); 409e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 4109e201c85SYohann CeedInt num_elem; 4119e201c85SYohann ierr = CeedElemRestrictionGetNumElements(Erestrict, &num_elem); 412e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 4139e201c85SYohann CeedInt strides[3] = {1, elem_size*num_elem, elem_size}; 4149e201c85SYohann if (!has_backend_strides) { 4157d8d0e25Snbeams ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 416e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 4177d8d0e25Snbeams } 4187d8d0e25Snbeams code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 4199e201c85SYohann code << " readDofsStrided"<<dim<<"d<num_comp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u_"<<i<<", r_u_"<<i<<");\n"; 4207d8d0e25Snbeams } 4217d8d0e25Snbeams } 4227d8d0e25Snbeams 4237d8d0e25Snbeams // Basis action 4249e201c85SYohann code << " // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n"; 4259e201c85SYohann switch (eval_mode) { 4267d8d0e25Snbeams case CEED_EVAL_NONE: 4279e201c85SYohann if (!use_collograd_parallelization) { 4289e201c85SYohann code << " CeedScalar* r_t_"<<i<<" = r_u_"<<i<<";\n"; 4297d8d0e25Snbeams } 4307d8d0e25Snbeams break; 4317d8d0e25Snbeams case CEED_EVAL_INTERP: 4329e201c85SYohann code << " CeedScalar r_t_"<<i<<"[num_comp_in_"<<i<<"*Q_1d];\n"; 4339e201c85SYohann code << " Interp"<<(dim>1?"Tensor":"")<<dim<<"d<num_comp_in_"<<i<<",P_in_"<<i<<",Q_1d>(data, r_u_"<<i<<", s_B_in_"<<i<<", r_t_"<<i<<");\n"; 4347d8d0e25Snbeams break; 4357d8d0e25Snbeams case CEED_EVAL_GRAD: 4369e201c85SYohann if (use_collograd_parallelization) { 4379e201c85SYohann code << " CeedScalar r_t_"<<i<<"[num_comp_in_"<<i<<"*Q_1d];\n"; 4389e201c85SYohann code << " Interp"<<(dim>1?"Tensor":"")<<dim<<"d<num_comp_in_"<<i<<",P_in_"<<i<<",Q_1d>(data, r_u_"<<i<<", s_B_in_"<<i<<", r_t_"<<i<<");\n"; 4397d8d0e25Snbeams } else { 4409e201c85SYohann CeedInt P_1d; 4419e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr); 4429e201c85SYohann ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 4439e201c85SYohann code << " CeedScalar r_t_"<<i<<"[num_comp_in_"<<i<<"*dim*Q_1d];\n"; 4449e201c85SYohann code << " Grad"<<(dim>1?"Tensor":"")<<(dim==3&&Q_1d>=P_1d?"Collocated":"")<<dim<<"d<num_comp_in_"<<i<<",P_in_"<<i<<",Q_1d>(data, r_u_"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t_"<<i<<");\n"; 4457d8d0e25Snbeams } 4467d8d0e25Snbeams break; 4477d8d0e25Snbeams case CEED_EVAL_WEIGHT: 4489e201c85SYohann code << " CeedScalar r_t_"<<i<<"[Q_1d];\n"; 4499e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr); 450e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 451437930d1SJeremy L Thompson data->W = basis_data->d_q_weight_1d; 4529e201c85SYohann code << " Weight"<<(dim>1?"Tensor":"")<<dim<<"d<Q_1d>(data, W, r_t_"<<i<<");\n"; 4537d8d0e25Snbeams break; // No action 4547d8d0e25Snbeams case CEED_EVAL_DIV: 4557d8d0e25Snbeams break; // TODO: Not implemented 4567d8d0e25Snbeams case CEED_EVAL_CURL: 4577d8d0e25Snbeams break; // TODO: Not implemented 4587d8d0e25Snbeams } 4597d8d0e25Snbeams } 4607d8d0e25Snbeams 4617d8d0e25Snbeams // Q function 4627d8d0e25Snbeams code << "\n // -- Output field setup --\n"; 4639e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 4647d8d0e25Snbeams code << "\n // ---- Output field "<<i<<" ----\n"; 4659e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 466e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 4679e201c85SYohann if (eval_mode==CEED_EVAL_GRAD) 4687d8d0e25Snbeams { 4699e201c85SYohann if (use_collograd_parallelization) { 4707d8d0e25Snbeams //Accumulator for gradient slices 4719e201c85SYohann code << " CeedScalar r_tt_"<<i<<"[num_comp_out_"<<i<<"*Q_1d];\n"; 4729e201c85SYohann code << " for (CeedInt i = 0; i < num_comp_out_"<<i<<"; i++) {\n"; 4739e201c85SYohann code << " for (CeedInt j = 0; j < Q_1d; ++j) {\n"; 4749e201c85SYohann code << " r_tt_"<<i<<"[j + i*Q_1d] = 0.0;\n"; 4757d8d0e25Snbeams code << " }\n"; 4767d8d0e25Snbeams code << " }\n"; 4777d8d0e25Snbeams } else { 4789e201c85SYohann code << " CeedScalar r_tt_"<<i<<"[num_comp_out_"<<i<<"*dim*Q_1d];\n"; 4797d8d0e25Snbeams } 4807d8d0e25Snbeams } 4819e201c85SYohann if (eval_mode==CEED_EVAL_NONE || eval_mode==CEED_EVAL_INTERP) 4827d8d0e25Snbeams { 4839e201c85SYohann code << " CeedScalar r_tt_"<<i<<"[num_comp_out_"<<i<<"*Q_1d];\n"; 4847d8d0e25Snbeams } 4857d8d0e25Snbeams } 4867d8d0e25Snbeams // We treat quadrature points per slice in 3d to save registers 4879e201c85SYohann if (use_collograd_parallelization) { 4889e201c85SYohann code << "\n // Note: Using planes of 3D elements\n"; 4897d8d0e25Snbeams code << "#pragma unroll\n"; 4909e201c85SYohann code << " for (CeedInt q = 0; q < Q_1d; q++) {\n"; 4917d8d0e25Snbeams code << " // -- Input fields --\n"; 4929e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 4937d8d0e25Snbeams code << " // ---- Input field "<<i<<" ----\n"; 4949e201c85SYohann // Get elem_size, eval_mode, num_comp 4959e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); 496e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 4977d8d0e25Snbeams // Basis action 4989e201c85SYohann code << " // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n"; 4999e201c85SYohann switch (eval_mode) { 5007d8d0e25Snbeams case CEED_EVAL_NONE: 5019e201c85SYohann code << " CeedScalar r_q_"<<i<<"[num_comp_in_"<<i<<"];\n"; 5027d8d0e25Snbeams 5039e201c85SYohann bool is_strided; 5049e201c85SYohann ierr = CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict); CeedChkBackend(ierr); 5059e201c85SYohann ierr = CeedElemRestrictionIsStrided(Erestrict, &is_strided); CeedChkBackend(ierr); 5069e201c85SYohann if (!is_strided) { 5077d8d0e25Snbeams ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 508e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 5097d8d0e25Snbeams code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 5109e201c85SYohann CeedInt comp_stride; 5119e201c85SYohann ierr = CeedElemRestrictionGetCompStride(Erestrict, &comp_stride); CeedChkBackend(ierr); 5129e201c85SYohann code << " // CompStride: "<<comp_stride<<"\n"; 513e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 5149e201c85SYohann data->indices.inputs[i] = restr_data->d_ind; 5159e201c85SYohann code << " readSliceQuadsOffset"<<"3d<num_comp_in_"<<i<<", "<<comp_stride<<", Q_1d>(data, lsize_in_"<<i<<", elem, q, indices.inputs["<<i<<"], d_u_"<<i<<", r_q_"<<i<<");\n"; 5167d8d0e25Snbeams } else { 5179e201c85SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elem_size); CeedChkBackend(ierr); 5189e201c85SYohann bool has_backend_strides; 5199e201c85SYohann ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides); 520e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 5219e201c85SYohann CeedInt num_elem; 5229e201c85SYohann ierr = CeedElemRestrictionGetNumElements(Erestrict, &num_elem); 523e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 5249e201c85SYohann CeedInt strides[3] = {1, elem_size*num_elem, elem_size}; 5259e201c85SYohann if (!has_backend_strides) { 5267d8d0e25Snbeams ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 527e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 5287d8d0e25Snbeams } 5297d8d0e25Snbeams code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 5309e201c85SYohann code << " readSliceQuadsStrided"<<"3d<num_comp_in_"<<i<<",Q_1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u_"<<i<<", r_q_"<<i<<");\n"; 5317d8d0e25Snbeams } 5327d8d0e25Snbeams break; 5337d8d0e25Snbeams case CEED_EVAL_INTERP: 5349e201c85SYohann code << " CeedScalar r_q_"<<i<<"[num_comp_in_"<<i<<"];\n"; 5359e201c85SYohann code << " for (CeedInt j = 0; j < num_comp_in_"<<i<<" ; ++j) {\n"; 5369e201c85SYohann code << " r_q_"<<i<<"[j] = r_t_"<<i<<"[q + j*Q_1d];\n"; 5377d8d0e25Snbeams code << " }\n"; 5387d8d0e25Snbeams break; 5397d8d0e25Snbeams case CEED_EVAL_GRAD: 5409e201c85SYohann code << " CeedScalar r_q_"<<i<<"[num_comp_in_"<<i<<"*dim];\n"; 5419e201c85SYohann code << " gradCollo3d<num_comp_in_"<<i<<",Q_1d>(data, q, r_t_"<<i<<", s_G_in_"<<i<<", r_q_"<<i<<");\n"; 5427d8d0e25Snbeams break; 5437d8d0e25Snbeams case CEED_EVAL_WEIGHT: 5449e201c85SYohann code << " CeedScalar r_q_"<<i<<"[1];\n"; 5459e201c85SYohann code << " r_q_"<<i<<"[0] = r_t_"<<i<<"[q];\n"; 5467d8d0e25Snbeams break; // No action 5477d8d0e25Snbeams case CEED_EVAL_DIV: 5487d8d0e25Snbeams break; // TODO: Not implemented 5497d8d0e25Snbeams case CEED_EVAL_CURL: 5507d8d0e25Snbeams break; // TODO: Not implemented 5517d8d0e25Snbeams } 5527d8d0e25Snbeams } 5537d8d0e25Snbeams code << "\n // -- Output fields --\n"; 5549e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 5557d8d0e25Snbeams code << " // ---- Output field "<<i<<" ----\n"; 5569e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 557e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 5587d8d0e25Snbeams // Basis action 5599e201c85SYohann switch (eval_mode) { 5607d8d0e25Snbeams case CEED_EVAL_NONE: 5619e201c85SYohann code << " CeedScalar r_qq_"<<i<<"[num_comp_out_"<<i<<"];\n"; 5627d8d0e25Snbeams break; // No action 5637d8d0e25Snbeams case CEED_EVAL_INTERP: 5649e201c85SYohann code << " CeedScalar r_qq_"<<i<<"[num_comp_out_"<<i<<"];\n"; 5657d8d0e25Snbeams break; 5667d8d0e25Snbeams case CEED_EVAL_GRAD: 5679e201c85SYohann code << " CeedScalar r_qq_"<<i<<"[num_comp_out_"<<i<<"*dim];\n"; 5687d8d0e25Snbeams break; 5697d8d0e25Snbeams case CEED_EVAL_WEIGHT: 5707d8d0e25Snbeams break; // Should not occur 5717d8d0e25Snbeams case CEED_EVAL_DIV: 5727d8d0e25Snbeams break; // TODO: Not implemented 5737d8d0e25Snbeams case CEED_EVAL_CURL: 5747d8d0e25Snbeams break; // TODO: Not implemented 5757d8d0e25Snbeams } 5767d8d0e25Snbeams } 5777d8d0e25Snbeams } else { 5789e201c85SYohann code << "\n // Note: Using full elements\n"; 5797d8d0e25Snbeams code << " // -- Input fields --\n"; 5809e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 5817d8d0e25Snbeams code << " // ---- Input field "<<i<<" ----\n"; 5829e201c85SYohann code << " CeedScalar* r_q_"<<i<<" = r_t_"<<i<<";\n"; 5837d8d0e25Snbeams } 5847d8d0e25Snbeams code << " // -- Output fields --\n"; 5859e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 5867d8d0e25Snbeams code << " // ---- Output field "<<i<<" ----\n"; 5879e201c85SYohann code << " CeedScalar* r_qq_"<<i<<" = r_tt_"<<i<<";\n"; 5887d8d0e25Snbeams } 5897d8d0e25Snbeams } 5907d8d0e25Snbeams code << "\n // -- QFunction Inputs and outputs --\n"; 5919e201c85SYohann code << " CeedScalar* in["<<num_input_fields<<"];\n"; 5929e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 5937d8d0e25Snbeams code << " // ---- Input field "<<i<<" ----\n"; 5949e201c85SYohann code << " in["<<i<<"] = r_q_"<<i<<";\n"; 5957d8d0e25Snbeams } 5969e201c85SYohann code << " CeedScalar* out["<<num_output_fields<<"];\n"; 5979e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 5987d8d0e25Snbeams code << " // ---- Output field "<<i<<" ----\n"; 5999e201c85SYohann code << " out["<<i<<"] = r_qq_"<<i<<";\n"; 6007d8d0e25Snbeams } 6017d8d0e25Snbeams code << "\n // -- Apply QFunction --\n"; 6029e201c85SYohann code << " "<<q_function_name<<"(ctx, "; 6039e201c85SYohann if (dim != 3 || use_collograd_parallelization) { 6047d8d0e25Snbeams code << "1"; 6057d8d0e25Snbeams } else { 6069e201c85SYohann code << "Q_1d"; 6077d8d0e25Snbeams } 6087d8d0e25Snbeams code << ", in, out);\n"; 6099e201c85SYohann if (use_collograd_parallelization) { 6107d8d0e25Snbeams code << " // -- Output fields --\n"; 6119e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 6127d8d0e25Snbeams code << " // ---- Output field "<<i<<" ----\n"; 6139e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 614e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 6157d8d0e25Snbeams // Basis action 6169e201c85SYohann code << " // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n"; 6179e201c85SYohann switch (eval_mode) { 6187d8d0e25Snbeams case CEED_EVAL_NONE: 6199e201c85SYohann code << " for (CeedInt j = 0; j < num_comp_out_"<<i<<" ; ++j) {\n"; 6209e201c85SYohann code << " r_tt_"<<i<<"[q + j*Q_1d] = r_qq_"<<i<<"[j];\n"; 6217d8d0e25Snbeams code << " }\n"; 6227d8d0e25Snbeams break; // No action 6237d8d0e25Snbeams case CEED_EVAL_INTERP: 6249e201c85SYohann code << " for (CeedInt j = 0; j < num_comp_out_"<<i<<" ; ++j) {\n"; 6259e201c85SYohann code << " r_tt_"<<i<<"[q + j*Q_1d] = r_qq_"<<i<<"[j];\n"; 6267d8d0e25Snbeams code << " }\n"; 6277d8d0e25Snbeams break; 6287d8d0e25Snbeams case CEED_EVAL_GRAD: 6299e201c85SYohann code << " gradColloTranspose3d<num_comp_out_"<<i<<",Q_1d>(data, q, r_qq_"<<i<<", s_G_out_"<<i<<", r_tt_"<<i<<");\n"; 6307d8d0e25Snbeams break; 6317d8d0e25Snbeams case CEED_EVAL_WEIGHT: 6327d8d0e25Snbeams break; // Should not occur 6337d8d0e25Snbeams case CEED_EVAL_DIV: 6347d8d0e25Snbeams break; // TODO: Not implemented 6357d8d0e25Snbeams case CEED_EVAL_CURL: 6367d8d0e25Snbeams break; // TODO: Not implemented 6377d8d0e25Snbeams } 6387d8d0e25Snbeams } 6397d8d0e25Snbeams code << " }\n"; 6407d8d0e25Snbeams } 6417d8d0e25Snbeams 6427d8d0e25Snbeams // Output basis apply if needed 6437d8d0e25Snbeams // Generate the correct eval mode code for each output 6447d8d0e25Snbeams code << "\n // -- Output field basis action and restrictions --\n"; 6459e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 6467d8d0e25Snbeams code << " // ---- Output field "<<i<<" ----\n"; 6479e201c85SYohann // Get elem_size, eval_mode, num_comp 6489e201c85SYohann ierr = CeedOperatorFieldGetElemRestriction(op_output_fields[i], &Erestrict); 649e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 6509e201c85SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elem_size); 651e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 6529e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 653e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 6549e201c85SYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &num_comp); 655e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 6567d8d0e25Snbeams // Basis action 6579e201c85SYohann code << " // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n"; 6589e201c85SYohann switch (eval_mode) { 6597d8d0e25Snbeams case CEED_EVAL_NONE: 6609e201c85SYohann code << " CeedScalar* r_v_"<<i<<" = r_tt_"<<i<<";\n"; 6617d8d0e25Snbeams break; // No action 6627d8d0e25Snbeams case CEED_EVAL_INTERP: 6639e201c85SYohann code << " CeedScalar r_v_"<<i<<"[num_comp_out_"<<i<<"*P_out_"<<i<<"];\n"; 6649e201c85SYohann code << " InterpTranspose"<<(dim>1?"Tensor":"")<<dim<<"d<num_comp_out_"<<i<<",P_out_"<<i<<",Q_1d>(data, r_tt_"<<i<<", s_B_out_"<<i<<", r_v_"<<i<<");\n"; 6657d8d0e25Snbeams break; 6667d8d0e25Snbeams case CEED_EVAL_GRAD: 6679e201c85SYohann code << " CeedScalar r_v_"<<i<<"[num_comp_out_"<<i<<"*P_out_"<<i<<"];\n"; 6689e201c85SYohann if (use_collograd_parallelization) { 6699e201c85SYohann code << " InterpTranspose"<<(dim>1?"Tensor":"")<<dim<<"d<num_comp_out_"<<i<<",P_out_"<<i<<",Q_1d>(data, r_tt_"<<i<<", s_B_out_"<<i<<", r_v_"<<i<<");\n"; 6707d8d0e25Snbeams } else { 6719e201c85SYohann CeedInt P_1d; 6729e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis); CeedChkBackend(ierr); 6739e201c85SYohann ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 6749e201c85SYohann code << " GradTranspose"<<(dim>1?"Tensor":"")<<(dim==3&&Q_1d>=P_1d?"Collocated":"")<<dim<<"d<num_comp_out_"<<i<<",P_out_"<<i<<",Q_1d>(data, r_tt_"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v_"<<i<<");\n"; 6757d8d0e25Snbeams } 6767d8d0e25Snbeams break; 6777d8d0e25Snbeams // LCOV_EXCL_START 6787d8d0e25Snbeams case CEED_EVAL_WEIGHT: { 6797d8d0e25Snbeams Ceed ceed; 680e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 681e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 6827d8d0e25Snbeams "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 6837d8d0e25Snbeams break; // Should not occur 6847d8d0e25Snbeams } 6857d8d0e25Snbeams case CEED_EVAL_DIV: 6867d8d0e25Snbeams break; // TODO: Not implemented 6877d8d0e25Snbeams case CEED_EVAL_CURL: 6887d8d0e25Snbeams break; // TODO: Not implemented 6897d8d0e25Snbeams // LCOV_EXCL_STOP 6907d8d0e25Snbeams } 6917d8d0e25Snbeams // Restriction 6929e201c85SYohann bool is_strided; 6939e201c85SYohann ierr = CeedElemRestrictionIsStrided(Erestrict, &is_strided); CeedChkBackend(ierr); 6949e201c85SYohann if (!is_strided) { 6957d8d0e25Snbeams ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 696e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 6977d8d0e25Snbeams code << " const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n"; 6989e201c85SYohann CeedInt comp_stride; 6999e201c85SYohann ierr = CeedElemRestrictionGetCompStride(Erestrict, &comp_stride); CeedChkBackend(ierr); 7009e201c85SYohann code << " // CompStride: "<<comp_stride<<"\n"; 701e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 7029e201c85SYohann data->indices.outputs[i] = restr_data->d_ind; 7039e201c85SYohann code << " writeDofsOffset"<<dim<<"d<num_comp_out_"<<i<<", "<<comp_stride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.outputs["<<i<<"], r_v_"<<i<<", d_v_"<<i<<");\n"; 7047d8d0e25Snbeams } else { 7059e201c85SYohann bool has_backend_strides; 7069e201c85SYohann ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides); 707e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 7089e201c85SYohann CeedInt num_elem; 7099e201c85SYohann ierr = CeedElemRestrictionGetNumElements(Erestrict, &num_elem); 710e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 7119e201c85SYohann CeedInt strides[3] = {1, elem_size*num_elem, elem_size}; 7129e201c85SYohann if (!has_backend_strides) { 7137d8d0e25Snbeams ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 714e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 7157d8d0e25Snbeams } 7167d8d0e25Snbeams code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 7179e201c85SYohann code << " writeDofsStrided"<<dim<<"d<num_comp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v_"<<i<<", d_v_"<<i<<");\n"; 7187d8d0e25Snbeams } 7197d8d0e25Snbeams } 7207d8d0e25Snbeams 7217d8d0e25Snbeams code << " }\n"; 7227d8d0e25Snbeams code << "}\n"; 7237d8d0e25Snbeams code << "// -----------------------------------------------------------------------------\n\n"; 7247d8d0e25Snbeams 7257d8d0e25Snbeams // View kernel for debugging 72646dc0734SJeremy L Thompson CeedDebug256(ceed, 2, "Generated Operator Kernels:\n"); 7273f21f6b1SJeremy L Thompson CeedDebug(ceed, code.str().c_str()); 7287d8d0e25Snbeams 729539ec17dSJeremy L Thompson CeedInt block_sizes[3] = {0, 0, 0}; 7309e201c85SYohann CeedInt num_elem; 7319e201c85SYohann ierr = CeedOperatorGetNumElements(op, &num_elem); CeedChkBackend(ierr); 7329e201c85SYohann ierr = BlockGridCalculate_Hip_gen(dim, num_elem, data->max_P_1d, Q_1d, block_sizes); 733b3e1519bSnbeams CeedChkBackend(ierr); 734b3e1519bSnbeams ierr = CeedCompileHip(ceed, code.str().c_str(), &data->module, 2, 7359e201c85SYohann "T_1D", block_sizes[0], 736b3e1519bSnbeams "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]); 737e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 7389e201c85SYohann ierr = CeedGetKernelHip(ceed, data->module, operator_name.c_str(), &data->op); 739e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 7407d8d0e25Snbeams 741e15f9bd0SJeremy L Thompson ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr); 742e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 7437d8d0e25Snbeams } 7447d8d0e25Snbeams //------------------------------------------------------------------------------ 745