13d8e8822SJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 37d8d0e25Snbeams // 43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 57d8d0e25Snbeams // 63d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 73d576824SJeremy L Thompson 87d8d0e25Snbeams #define CEED_DEBUG_COLOR 12 97d8d0e25Snbeams 10ec3da8bcSJed Brown #include <ceed/ceed.h> 11ec3da8bcSJed Brown #include <ceed/backend.h> 12*9e201c85SYohann #include <ceed/jit-tools.h> 137d8d0e25Snbeams #include <iostream> 143d576824SJeremy L Thompson #include <string> 157d8d0e25Snbeams #include <sstream> 163d576824SJeremy L Thompson #include "ceed-hip-gen.h" 170d0321e0SJeremy L Thompson #include "../hip-ref/ceed-hip-ref.h" 187d8d0e25Snbeams #include "../hip-shared/ceed-hip-shared.h" 197d8d0e25Snbeams #include "../hip/ceed-hip-compile.h" 207d8d0e25Snbeams 21b3e1519bSnbeams 22b3e1519bSnbeams //------------------------------------------------------------------------------ 23b3e1519bSnbeams // Calculate the block size used for launching the operator kernel 24b3e1519bSnbeams //------------------------------------------------------------------------------ 25*9e201c85SYohann extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt num_elem, 26*9e201c85SYohann const CeedInt P_1d, const CeedInt Q_1d, 27b3e1519bSnbeams CeedInt *block_sizes) { 28b3e1519bSnbeams 29*9e201c85SYohann const CeedInt thread1d = CeedIntMax(Q_1d, P_1d); 30b3e1519bSnbeams if (dim==1) { 31*9e201c85SYohann CeedInt elems_per_block = 64*thread1d > 256? 256/thread1d : 64; 32*9e201c85SYohann elems_per_block = elems_per_block>0?elems_per_block:1; 33b3e1519bSnbeams block_sizes[0] = thread1d; 34b3e1519bSnbeams block_sizes[1] = 1; 35*9e201c85SYohann block_sizes[2] = elems_per_block; 36b3e1519bSnbeams } else if (dim==2) { 37*9e201c85SYohann const CeedInt elems_per_block = thread1d<4? 16 : 2; 38b3e1519bSnbeams block_sizes[0] = thread1d; 39b3e1519bSnbeams block_sizes[1] = thread1d; 40*9e201c85SYohann block_sizes[2] = elems_per_block; 41b3e1519bSnbeams } else if (dim==3) { 42*9e201c85SYohann const CeedInt elems_per_block = thread1d<6? 4 : (thread1d<8? 2 : 1); 43b3e1519bSnbeams block_sizes[0] = thread1d; 44b3e1519bSnbeams block_sizes[1] = thread1d; 45*9e201c85SYohann block_sizes[2] = elems_per_block; 46b3e1519bSnbeams } 47b3e1519bSnbeams return CEED_ERROR_SUCCESS; 48b3e1519bSnbeams } 49b3e1519bSnbeams 507d8d0e25Snbeams //------------------------------------------------------------------------------ 51*9e201c85SYohann // Build single operator kernel 527d8d0e25Snbeams //------------------------------------------------------------------------------ 5337c3b1cfSnbeams extern "C" int CeedHipGenOperatorBuild(CeedOperator op) { 547d8d0e25Snbeams 557d8d0e25Snbeams using std::ostringstream; 567d8d0e25Snbeams using std::string; 577d8d0e25Snbeams int ierr; 58*9e201c85SYohann bool is_setup_done; 59*9e201c85SYohann ierr = CeedOperatorIsSetupDone(op, &is_setup_done); CeedChkBackend(ierr); 60*9e201c85SYohann if (is_setup_done) return CEED_ERROR_SUCCESS; 617d8d0e25Snbeams Ceed ceed; 62e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 637d8d0e25Snbeams CeedOperator_Hip_gen *data; 64e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr); 657d8d0e25Snbeams CeedQFunction qf; 667d8d0e25Snbeams CeedQFunction_Hip_gen *qf_data; 67e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); 68e15f9bd0SJeremy L Thompson ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr); 69e79b91d9SJeremy L Thompson CeedSize lsize; 70*9e201c85SYohann CeedInt Q, P_1d = 0, Q_1d = 0, elem_size, num_input_fields, 71*9e201c85SYohann num_output_fields, num_comp, dim = 1; 72e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); 73*9e201c85SYohann Q_1d = Q; 74*9e201c85SYohann CeedOperatorField *op_input_fields, *op_output_fields; 75*9e201c85SYohann ierr = CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields); 76e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 77*9e201c85SYohann CeedQFunctionField *qf_input_fields, *qf_output_fields; 78*9e201c85SYohann ierr = CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields); 79e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 80*9e201c85SYohann CeedEvalMode eval_mode; 817d8d0e25Snbeams CeedBasis basis; 827d8d0e25Snbeams CeedBasis_Hip_shared *basis_data; 837d8d0e25Snbeams CeedElemRestriction Erestrict; 847d8d0e25Snbeams CeedElemRestriction_Hip *restr_data; 857d8d0e25Snbeams 860b454692Sjeremylt // Check for restriction only identity operator 870b454692Sjeremylt bool is_identity_qf; 880b454692Sjeremylt ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr); 890b454692Sjeremylt if (is_identity_qf) { 90*9e201c85SYohann CeedEvalMode eval_mode_in, eval_mode_out; 91*9e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in); CeedChkBackend(ierr); 92*9e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out); CeedChkBackend(ierr); 93*9e201c85SYohann if (eval_mode_in == CEED_EVAL_NONE && eval_mode_out == CEED_EVAL_NONE) 940b454692Sjeremylt // LCOV_EXCL_START 950b454692Sjeremylt return CeedError(ceed, CEED_ERROR_BACKEND, 960b454692Sjeremylt "Backend does not implement restriction only identity operators"); 970b454692Sjeremylt // LCOV_EXCL_STOP 980b454692Sjeremylt } 990b454692Sjeremylt 1007d8d0e25Snbeams ostringstream code; 101*9e201c85SYohann // TODO: generalize to accept different device functions? 102*9e201c85SYohann { 103*9e201c85SYohann char *tensor_basis_kernel_path, *tensor_basis_kernel_source; 104*9e201c85SYohann ierr = CeedGetJitAbsolutePath(ceed, 105*9e201c85SYohann "ceed/jit-source/hip/hip-shared-basis-tensor-templates.h", 106*9e201c85SYohann &tensor_basis_kernel_path); CeedChkBackend(ierr); 107*9e201c85SYohann CeedDebug256(ceed, 2, "----- Loading Tensor Basis Kernel Source -----\n"); 108*9e201c85SYohann ierr = CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_kernel_source); 109*9e201c85SYohann CeedChkBackend(ierr); 110*9e201c85SYohann code << tensor_basis_kernel_source; 111*9e201c85SYohann ierr = CeedFree(&tensor_basis_kernel_path); CeedChkBackend(ierr); 112*9e201c85SYohann ierr = CeedFree(&tensor_basis_kernel_source); CeedChkBackend(ierr); 113*9e201c85SYohann } 114*9e201c85SYohann { 115*9e201c85SYohann char *hip_gen_template_path, *hip_gen_template_source; 116*9e201c85SYohann ierr = CeedGetJitAbsolutePath(ceed, 117*9e201c85SYohann "ceed/jit-source/hip/hip-gen-templates.h", 118*9e201c85SYohann &hip_gen_template_path); CeedChkBackend(ierr); 119*9e201c85SYohann CeedDebug256(ceed, 2, "----- Loading Hip-Gen Template Source -----\n"); 120*9e201c85SYohann ierr = CeedLoadSourceToBuffer(ceed, hip_gen_template_path, &hip_gen_template_source); 121*9e201c85SYohann CeedChkBackend(ierr); 122*9e201c85SYohann code << hip_gen_template_source; 123*9e201c85SYohann ierr = CeedFree(&hip_gen_template_path); CeedChkBackend(ierr); 124*9e201c85SYohann ierr = CeedFree(&hip_gen_template_source); CeedChkBackend(ierr); 125*9e201c85SYohann } 1267d8d0e25Snbeams 127*9e201c85SYohann string q_function_source(qf_data->q_function_source); 128*9e201c85SYohann string q_function_name(qf_data->q_function_name); 129*9e201c85SYohann string operator_name; 130*9e201c85SYohann operator_name = "CeedKernel_Hip_gen_" + q_function_name; 1317d8d0e25Snbeams 132*9e201c85SYohann // Find dim, P_1d, Q_1d 133*9e201c85SYohann data->max_P_1d = 0; 134*9e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 135*9e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr); 1367d8d0e25Snbeams if (basis != CEED_BASIS_COLLOCATED) { 137e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 138*9e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); 139e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1407d8d0e25Snbeams 141*9e201c85SYohann // Collect dim, P_1d, and Q_1d 142e15f9bd0SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 1437d8d0e25Snbeams bool isTensor; 144e15f9bd0SJeremy L Thompson ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr); 1457d8d0e25Snbeams if (isTensor) { 146*9e201c85SYohann ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 147*9e201c85SYohann ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 148*9e201c85SYohann if (P_1d > data->max_P_1d) data->max_P_1d = P_1d; 1497d8d0e25Snbeams } else { 1507d8d0e25Snbeams // LCOV_EXCL_START 151e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 1527d8d0e25Snbeams // LCOV_EXCL_STOP 1537d8d0e25Snbeams } 1547d8d0e25Snbeams } 1557d8d0e25Snbeams } 156*9e201c85SYohann // Check output bases for Q_1d, dim as well 157dc64899eSYohann // The only input basis might be CEED_BASIS_COLLOCATED 158*9e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 159*9e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis); CeedChkBackend(ierr); 1607d8d0e25Snbeams 1617d8d0e25Snbeams if (basis != CEED_BASIS_COLLOCATED) { 162e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 163*9e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 164e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1657d8d0e25Snbeams 166*9e201c85SYohann // Collect Q_1d 167e15f9bd0SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 1687d8d0e25Snbeams bool isTensor; 169e15f9bd0SJeremy L Thompson ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr); 1707d8d0e25Snbeams if (isTensor) { 171*9e201c85SYohann ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 1727d8d0e25Snbeams } else { 1737d8d0e25Snbeams // LCOV_EXCL_START 174e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 1757d8d0e25Snbeams // LCOV_EXCL_STOP 1767d8d0e25Snbeams } 1777d8d0e25Snbeams } 1787d8d0e25Snbeams } 1797d8d0e25Snbeams data->dim = dim; 180*9e201c85SYohann data->Q_1d = Q_1d; 1817d8d0e25Snbeams 182*9e201c85SYohann // Only use 3D collocated gradient parallelization strategy when gradient is computed 183*9e201c85SYohann // TODO: put in a function? 184*9e201c85SYohann bool use_collograd_parallelization = false; 185*9e201c85SYohann if (dim == 3) { 186*9e201c85SYohann bool was_grad_found = false; 187*9e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 188*9e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); 189*9e201c85SYohann if (eval_mode == CEED_EVAL_GRAD) { 190*9e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr); 191*9e201c85SYohann ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 192*9e201c85SYohann use_collograd_parallelization = !!basis_data->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true); 193*9e201c85SYohann was_grad_found = true; 194*9e201c85SYohann } 195*9e201c85SYohann } 196*9e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 197*9e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 198*9e201c85SYohann if (eval_mode == CEED_EVAL_GRAD) { 199*9e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis); CeedChkBackend(ierr); 200*9e201c85SYohann ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 201*9e201c85SYohann use_collograd_parallelization = !!basis_data->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true); 202*9e201c85SYohann was_grad_found = true; 203*9e201c85SYohann } 204*9e201c85SYohann } 2057d8d0e25Snbeams } 2067d8d0e25Snbeams 207*9e201c85SYohann // Define CEED_Q_VLA 208*9e201c85SYohann code << "\n#undef CEED_Q_VLA\n"; 209*9e201c85SYohann if (dim != 3 || use_collograd_parallelization) { 210*9e201c85SYohann code << "#define CEED_Q_VLA 1\n\n"; 211*9e201c85SYohann } else { 212*9e201c85SYohann code << "#define CEED_Q_VLA "<<Q_1d<<"\n\n"; 213*9e201c85SYohann } 214*9e201c85SYohann 215*9e201c85SYohann code << q_function_source; 2167d8d0e25Snbeams 2177d8d0e25Snbeams // Setup 2187d8d0e25Snbeams code << "\n// -----------------------------------------------------------------------------\n"; 219b3e1519bSnbeams code << "\nextern \"C\" __launch_bounds__(BLOCK_SIZE)\n"; 220*9e201c85SYohann code << "__global__ void "<<operator_name<<"(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar* W) {\n"; 221*9e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 222*9e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); 223e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 224*9e201c85SYohann if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 225*9e201c85SYohann code << " const CeedScalar* d_u_" <<i<<" = fields.inputs["<<i<<"];\n"; 2267d8d0e25Snbeams } 2277d8d0e25Snbeams } 2287d8d0e25Snbeams 229*9e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 230*9e201c85SYohann code << " CeedScalar* d_v_"<<i<<" = fields.outputs["<<i<<"];\n"; 2317d8d0e25Snbeams } 2327d8d0e25Snbeams 233*9e201c85SYohann code << " const CeedInt dim = "<<dim<<";\n"; 234*9e201c85SYohann code << " const CeedInt Q_1d = "<<Q_1d<<";\n"; 2357d8d0e25Snbeams 2367d8d0e25Snbeams code << " HIP_DYNAMIC_SHARED( CeedScalar, slice)\n"; 237*9e201c85SYohann code << " SharedData_Hip data;\n"; 238*9e201c85SYohann code << " data.t_id_x = threadIdx.x;\n"; 239*9e201c85SYohann code << " data.t_id_y = threadIdx.y;\n"; 240*9e201c85SYohann code << " data.t_id_z = threadIdx.z;\n"; 241*9e201c85SYohann code << " data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 242*9e201c85SYohann code << " data.slice = slice+data.t_id_z*T_1D"<<(dim>1?"*T_1D":"")<<";\n"; 2437d8d0e25Snbeams 2447d8d0e25Snbeams code << "\n // -- Input field constants and basis data --\n"; 2457d8d0e25Snbeams //Initialize constants, and matrices B and G 246*9e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 2477d8d0e25Snbeams code << " // ---- Input field "<<i<<" ----\n"; 248*9e201c85SYohann // Get elem_size, eval_mode, num_comp 249*9e201c85SYohann ierr = CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict); 250e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 251*9e201c85SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elem_size); 252e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 253*9e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); 254e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 255*9e201c85SYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &num_comp); 256e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 2577d8d0e25Snbeams 2587d8d0e25Snbeams // Set field constants 259*9e201c85SYohann if (eval_mode != CEED_EVAL_WEIGHT) { 260*9e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr); 2617d8d0e25Snbeams if (basis != CEED_BASIS_COLLOCATED) { 262*9e201c85SYohann ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 263*9e201c85SYohann code << " const CeedInt P_in_"<<i<<" = "<<P_1d<<";\n"; 2647d8d0e25Snbeams } else { 265*9e201c85SYohann code << " const CeedInt P_in_"<<i<<" = "<<Q_1d<<";\n"; 2667d8d0e25Snbeams } 267*9e201c85SYohann code << " const CeedInt num_comp_in_"<<i<<" = "<<num_comp<<";\n"; 2687d8d0e25Snbeams } 2697d8d0e25Snbeams 2707d8d0e25Snbeams // Load basis data 271*9e201c85SYohann code << " // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n"; 272*9e201c85SYohann switch (eval_mode) { 2737d8d0e25Snbeams case CEED_EVAL_NONE: 2747d8d0e25Snbeams break; 2757d8d0e25Snbeams case CEED_EVAL_INTERP: 276e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 277*9e201c85SYohann data->B.inputs[i] = basis_data->d_interp_1d; 278*9e201c85SYohann code << " __shared__ CeedScalar s_B_in_"<<i<<"["<<P_1d*Q_1d<<"];\n"; 279*9e201c85SYohann code << " loadMatrix<P_in_"<<i<<",Q_1d>(data, B.inputs["<<i<<"], s_B_in_"<<i<<");\n"; 2807d8d0e25Snbeams break; 2817d8d0e25Snbeams case CEED_EVAL_GRAD: 282e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 283*9e201c85SYohann data->B.inputs[i] = basis_data->d_interp_1d; 284*9e201c85SYohann code << " __shared__ CeedScalar s_B_in_"<<i<<"["<<P_1d*Q_1d<<"];\n"; 285*9e201c85SYohann code << " loadMatrix<P_in_"<<i<<",Q_1d>(data, B.inputs["<<i<<"], s_B_in_"<<i<<");\n"; 286*9e201c85SYohann if (use_collograd_parallelization) { 287*9e201c85SYohann data->G.inputs[i] = basis_data->d_collo_grad_1d; 288*9e201c85SYohann code << " __shared__ CeedScalar s_G_in_"<<i<<"["<<Q_1d*Q_1d<<"];\n"; 289*9e201c85SYohann code << " loadMatrix<Q_1d,Q_1d>(data, G.inputs["<<i<<"], s_G_in_"<<i<<");\n"; 2907d8d0e25Snbeams } else { 291*9e201c85SYohann bool has_collo_grad = !!basis_data->d_collo_grad_1d; 292*9e201c85SYohann data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; 293*9e201c85SYohann code << " __shared__ CeedScalar s_G_in_"<<i<<"["<<Q_1d*(has_collo_grad?Q_1d:P_1d)<<"];\n"; 294*9e201c85SYohann code << " loadMatrix<"<<(has_collo_grad?"Q_1d":("P_in_"+std::to_string(i)))<<",Q_1d>(data, G.inputs["<<i<<"], s_G_in_"<<i<<");\n"; 2957d8d0e25Snbeams } 2967d8d0e25Snbeams break; 2977d8d0e25Snbeams case CEED_EVAL_WEIGHT: 2987d8d0e25Snbeams break; // No action 2997d8d0e25Snbeams case CEED_EVAL_DIV: 3007d8d0e25Snbeams break; // TODO: Not implemented 3017d8d0e25Snbeams case CEED_EVAL_CURL: 3027d8d0e25Snbeams break; // TODO: Not implemented 3037d8d0e25Snbeams } 3047d8d0e25Snbeams } 3057d8d0e25Snbeams 3067d8d0e25Snbeams code << "\n // -- Output field constants and basis data --\n"; 307*9e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 3087d8d0e25Snbeams code << " // ---- Output field "<<i<<" ----\n"; 309*9e201c85SYohann // Get elem_size, eval_mode, num_comp 310*9e201c85SYohann ierr = CeedOperatorFieldGetElemRestriction(op_output_fields[i], &Erestrict); 311e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 312*9e201c85SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elem_size); 313e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 314*9e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 315e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 316*9e201c85SYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &num_comp); 317e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 3187d8d0e25Snbeams 3197d8d0e25Snbeams // Set field constants 320*9e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis); CeedChkBackend(ierr); 3217d8d0e25Snbeams if (basis != CEED_BASIS_COLLOCATED) { 322*9e201c85SYohann ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 323*9e201c85SYohann code << " const CeedInt P_out_"<<i<<" = "<<P_1d<<";\n"; 3247d8d0e25Snbeams } else { 325*9e201c85SYohann code << " const CeedInt P_out_"<<i<<" = "<<Q_1d<<";\n"; 3267d8d0e25Snbeams } 327*9e201c85SYohann code << " const CeedInt num_comp_out_"<<i<<" = "<<num_comp<<";\n"; 3287d8d0e25Snbeams 3297d8d0e25Snbeams // Load basis data 330*9e201c85SYohann code << " // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n"; 331*9e201c85SYohann switch (eval_mode) { 3327d8d0e25Snbeams case CEED_EVAL_NONE: 3337d8d0e25Snbeams break; // No action 3347d8d0e25Snbeams case CEED_EVAL_INTERP: 335e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 336*9e201c85SYohann data->B.outputs[i] = basis_data->d_interp_1d; 337*9e201c85SYohann code << " __shared__ CeedScalar s_B_out_"<<i<<"["<<P_1d*Q_1d<<"];\n"; 338*9e201c85SYohann code << " loadMatrix<P_out_"<<i<<",Q_1d>(data, B.outputs["<<i<<"], s_B_out_"<<i<<");\n"; 3397d8d0e25Snbeams break; 3407d8d0e25Snbeams case CEED_EVAL_GRAD: 341e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 342*9e201c85SYohann data->B.outputs[i] = basis_data->d_interp_1d; 343*9e201c85SYohann code << " __shared__ CeedScalar s_B_out_"<<i<<"["<<P_1d*Q_1d<<"];\n"; 344*9e201c85SYohann code << " loadMatrix<P_out_"<<i<<",Q_1d>(data, B.outputs["<<i<<"], s_B_out_"<<i<<");\n"; 345*9e201c85SYohann if (use_collograd_parallelization) { 346*9e201c85SYohann data->G.outputs[i] = basis_data->d_collo_grad_1d; 347*9e201c85SYohann code << " __shared__ CeedScalar s_G_out_"<<i<<"["<<Q_1d*Q_1d<<"];\n"; 348*9e201c85SYohann code << " loadMatrix<Q_1d,Q_1d>(data, G.outputs["<<i<<"], s_G_out_"<<i<<");\n"; 3497d8d0e25Snbeams } else { 350*9e201c85SYohann bool has_collo_grad = !!basis_data->d_collo_grad_1d; 351*9e201c85SYohann data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; 352*9e201c85SYohann code << " __shared__ CeedScalar s_G_out_"<<i<<"["<<Q_1d*(has_collo_grad?Q_1d:P_1d)<<"];\n"; 353*9e201c85SYohann code << " loadMatrix<"<<(has_collo_grad?"Q_1d":("P_out_"+std::to_string(i)))<<",Q_1d>(data, G.outputs["<<i<<"], s_G_out_"<<i<<");\n"; 3547d8d0e25Snbeams } 3557d8d0e25Snbeams break; 3567d8d0e25Snbeams // LCOV_EXCL_START 3577d8d0e25Snbeams case CEED_EVAL_WEIGHT: { 3587d8d0e25Snbeams Ceed ceed; 359e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 360e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 3617d8d0e25Snbeams "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 3627d8d0e25Snbeams break; // Should not occur 3637d8d0e25Snbeams } 3647d8d0e25Snbeams case CEED_EVAL_DIV: 3657d8d0e25Snbeams break; // TODO: Not implemented 3667d8d0e25Snbeams case CEED_EVAL_CURL: 3677d8d0e25Snbeams break; // TODO: Not implemented 3687d8d0e25Snbeams // LCOV_EXCL_STOP 3697d8d0e25Snbeams } 3707d8d0e25Snbeams } 3717d8d0e25Snbeams code << "\n // -- Element loop --\n"; 3727d8d0e25Snbeams code << " __syncthreads();\n"; 373*9e201c85SYohann code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n"; 3747d8d0e25Snbeams // Input basis apply if needed 3757d8d0e25Snbeams // Generate the correct eval mode code for each input 3767d8d0e25Snbeams code << " // -- Input field restrictions and basis actions --\n"; 377*9e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 3787d8d0e25Snbeams code << " // ---- Input field "<<i<<" ----\n"; 379*9e201c85SYohann // Get elem_size, eval_mode, num_comp 380*9e201c85SYohann ierr = CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict); 381e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 382*9e201c85SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elem_size); 383e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 384*9e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); 385e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 386*9e201c85SYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &num_comp); 387e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 3887d8d0e25Snbeams 3897d8d0e25Snbeams // Restriction 390*9e201c85SYohann if (eval_mode != CEED_EVAL_WEIGHT && 391*9e201c85SYohann !((eval_mode == CEED_EVAL_NONE) && use_collograd_parallelization)) { 392*9e201c85SYohann code << " CeedScalar r_u_"<<i<<"[num_comp_in_"<<i<<"*P_in_"<<i<<"];\n"; 3937d8d0e25Snbeams 394*9e201c85SYohann bool is_strided; 395*9e201c85SYohann ierr = CeedElemRestrictionIsStrided(Erestrict, &is_strided); CeedChkBackend(ierr); 396*9e201c85SYohann if (!is_strided) { 3977d8d0e25Snbeams ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 398e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 3997d8d0e25Snbeams code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 400*9e201c85SYohann CeedInt comp_stride; 401*9e201c85SYohann ierr = CeedElemRestrictionGetCompStride(Erestrict, &comp_stride); CeedChkBackend(ierr); 402*9e201c85SYohann code << " // CompStride: "<<comp_stride<<"\n"; 403e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 404*9e201c85SYohann data->indices.inputs[i] = restr_data->d_ind; 405*9e201c85SYohann code << " readDofsOffset"<<dim<<"d<num_comp_in_"<<i<<", "<<comp_stride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.inputs["<<i<<"], d_u_"<<i<<", r_u_"<<i<<");\n"; 4067d8d0e25Snbeams } else { 407*9e201c85SYohann bool has_backend_strides; 408*9e201c85SYohann ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides); 409e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 410*9e201c85SYohann CeedInt num_elem; 411*9e201c85SYohann ierr = CeedElemRestrictionGetNumElements(Erestrict, &num_elem); 412e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 413*9e201c85SYohann CeedInt strides[3] = {1, elem_size*num_elem, elem_size}; 414*9e201c85SYohann if (!has_backend_strides) { 4157d8d0e25Snbeams ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 416e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 4177d8d0e25Snbeams } 4187d8d0e25Snbeams code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 419*9e201c85SYohann code << " readDofsStrided"<<dim<<"d<num_comp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u_"<<i<<", r_u_"<<i<<");\n"; 4207d8d0e25Snbeams } 4217d8d0e25Snbeams } 4227d8d0e25Snbeams 4237d8d0e25Snbeams // Basis action 424*9e201c85SYohann code << " // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n"; 425*9e201c85SYohann switch (eval_mode) { 4267d8d0e25Snbeams case CEED_EVAL_NONE: 427*9e201c85SYohann if (!use_collograd_parallelization) { 428*9e201c85SYohann code << " CeedScalar* r_t_"<<i<<" = r_u_"<<i<<";\n"; 4297d8d0e25Snbeams } 4307d8d0e25Snbeams break; 4317d8d0e25Snbeams case CEED_EVAL_INTERP: 432*9e201c85SYohann code << " CeedScalar r_t_"<<i<<"[num_comp_in_"<<i<<"*Q_1d];\n"; 433*9e201c85SYohann code << " Interp"<<(dim>1?"Tensor":"")<<dim<<"d<num_comp_in_"<<i<<",P_in_"<<i<<",Q_1d>(data, r_u_"<<i<<", s_B_in_"<<i<<", r_t_"<<i<<");\n"; 4347d8d0e25Snbeams break; 4357d8d0e25Snbeams case CEED_EVAL_GRAD: 436*9e201c85SYohann if (use_collograd_parallelization) { 437*9e201c85SYohann code << " CeedScalar r_t_"<<i<<"[num_comp_in_"<<i<<"*Q_1d];\n"; 438*9e201c85SYohann code << " Interp"<<(dim>1?"Tensor":"")<<dim<<"d<num_comp_in_"<<i<<",P_in_"<<i<<",Q_1d>(data, r_u_"<<i<<", s_B_in_"<<i<<", r_t_"<<i<<");\n"; 4397d8d0e25Snbeams } else { 440*9e201c85SYohann CeedInt P_1d; 441*9e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr); 442*9e201c85SYohann ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 443*9e201c85SYohann code << " CeedScalar r_t_"<<i<<"[num_comp_in_"<<i<<"*dim*Q_1d];\n"; 444*9e201c85SYohann code << " Grad"<<(dim>1?"Tensor":"")<<(dim==3&&Q_1d>=P_1d?"Collocated":"")<<dim<<"d<num_comp_in_"<<i<<",P_in_"<<i<<",Q_1d>(data, r_u_"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t_"<<i<<");\n"; 4457d8d0e25Snbeams } 4467d8d0e25Snbeams break; 4477d8d0e25Snbeams case CEED_EVAL_WEIGHT: 448*9e201c85SYohann code << " CeedScalar r_t_"<<i<<"[Q_1d];\n"; 449*9e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr); 450e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 451437930d1SJeremy L Thompson data->W = basis_data->d_q_weight_1d; 452*9e201c85SYohann code << " Weight"<<(dim>1?"Tensor":"")<<dim<<"d<Q_1d>(data, W, r_t_"<<i<<");\n"; 4537d8d0e25Snbeams break; // No action 4547d8d0e25Snbeams case CEED_EVAL_DIV: 4557d8d0e25Snbeams break; // TODO: Not implemented 4567d8d0e25Snbeams case CEED_EVAL_CURL: 4577d8d0e25Snbeams break; // TODO: Not implemented 4587d8d0e25Snbeams } 4597d8d0e25Snbeams } 4607d8d0e25Snbeams 4617d8d0e25Snbeams // Q function 4627d8d0e25Snbeams code << "\n // -- Output field setup --\n"; 463*9e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 4647d8d0e25Snbeams code << "\n // ---- Output field "<<i<<" ----\n"; 465*9e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 466e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 467*9e201c85SYohann if (eval_mode==CEED_EVAL_GRAD) 4687d8d0e25Snbeams { 469*9e201c85SYohann if (use_collograd_parallelization) { 4707d8d0e25Snbeams //Accumulator for gradient slices 471*9e201c85SYohann code << " CeedScalar r_tt_"<<i<<"[num_comp_out_"<<i<<"*Q_1d];\n"; 472*9e201c85SYohann code << " for (CeedInt i = 0; i < num_comp_out_"<<i<<"; i++) {\n"; 473*9e201c85SYohann code << " for (CeedInt j = 0; j < Q_1d; ++j) {\n"; 474*9e201c85SYohann code << " r_tt_"<<i<<"[j + i*Q_1d] = 0.0;\n"; 4757d8d0e25Snbeams code << " }\n"; 4767d8d0e25Snbeams code << " }\n"; 4777d8d0e25Snbeams } else { 478*9e201c85SYohann code << " CeedScalar r_tt_"<<i<<"[num_comp_out_"<<i<<"*dim*Q_1d];\n"; 4797d8d0e25Snbeams } 4807d8d0e25Snbeams } 481*9e201c85SYohann if (eval_mode==CEED_EVAL_NONE || eval_mode==CEED_EVAL_INTERP) 4827d8d0e25Snbeams { 483*9e201c85SYohann code << " CeedScalar r_tt_"<<i<<"[num_comp_out_"<<i<<"*Q_1d];\n"; 4847d8d0e25Snbeams } 4857d8d0e25Snbeams } 4867d8d0e25Snbeams // We treat quadrature points per slice in 3d to save registers 487*9e201c85SYohann if (use_collograd_parallelization) { 488*9e201c85SYohann code << "\n // Note: Using planes of 3D elements\n"; 4897d8d0e25Snbeams code << "#pragma unroll\n"; 490*9e201c85SYohann code << " for (CeedInt q = 0; q < Q_1d; q++) {\n"; 4917d8d0e25Snbeams code << " // -- Input fields --\n"; 492*9e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 4937d8d0e25Snbeams code << " // ---- Input field "<<i<<" ----\n"; 494*9e201c85SYohann // Get elem_size, eval_mode, num_comp 495*9e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); 496e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 4977d8d0e25Snbeams // Basis action 498*9e201c85SYohann code << " // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n"; 499*9e201c85SYohann switch (eval_mode) { 5007d8d0e25Snbeams case CEED_EVAL_NONE: 501*9e201c85SYohann code << " CeedScalar r_q_"<<i<<"[num_comp_in_"<<i<<"];\n"; 5027d8d0e25Snbeams 503*9e201c85SYohann bool is_strided; 504*9e201c85SYohann ierr = CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict); CeedChkBackend(ierr); 505*9e201c85SYohann ierr = CeedElemRestrictionIsStrided(Erestrict, &is_strided); CeedChkBackend(ierr); 506*9e201c85SYohann if (!is_strided) { 5077d8d0e25Snbeams ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 508e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 5097d8d0e25Snbeams code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 510*9e201c85SYohann CeedInt comp_stride; 511*9e201c85SYohann ierr = CeedElemRestrictionGetCompStride(Erestrict, &comp_stride); CeedChkBackend(ierr); 512*9e201c85SYohann code << " // CompStride: "<<comp_stride<<"\n"; 513e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 514*9e201c85SYohann data->indices.inputs[i] = restr_data->d_ind; 515*9e201c85SYohann code << " readSliceQuadsOffset"<<"3d<num_comp_in_"<<i<<", "<<comp_stride<<", Q_1d>(data, lsize_in_"<<i<<", elem, q, indices.inputs["<<i<<"], d_u_"<<i<<", r_q_"<<i<<");\n"; 5167d8d0e25Snbeams } else { 517*9e201c85SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elem_size); CeedChkBackend(ierr); 518*9e201c85SYohann bool has_backend_strides; 519*9e201c85SYohann ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides); 520e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 521*9e201c85SYohann CeedInt num_elem; 522*9e201c85SYohann ierr = CeedElemRestrictionGetNumElements(Erestrict, &num_elem); 523e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 524*9e201c85SYohann CeedInt strides[3] = {1, elem_size*num_elem, elem_size}; 525*9e201c85SYohann if (!has_backend_strides) { 5267d8d0e25Snbeams ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 527e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 5287d8d0e25Snbeams } 5297d8d0e25Snbeams code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 530*9e201c85SYohann code << " readSliceQuadsStrided"<<"3d<num_comp_in_"<<i<<",Q_1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u_"<<i<<", r_q_"<<i<<");\n"; 5317d8d0e25Snbeams } 5327d8d0e25Snbeams break; 5337d8d0e25Snbeams case CEED_EVAL_INTERP: 534*9e201c85SYohann code << " CeedScalar r_q_"<<i<<"[num_comp_in_"<<i<<"];\n"; 535*9e201c85SYohann code << " for (CeedInt j = 0; j < num_comp_in_"<<i<<" ; ++j) {\n"; 536*9e201c85SYohann code << " r_q_"<<i<<"[j] = r_t_"<<i<<"[q + j*Q_1d];\n"; 5377d8d0e25Snbeams code << " }\n"; 5387d8d0e25Snbeams break; 5397d8d0e25Snbeams case CEED_EVAL_GRAD: 540*9e201c85SYohann code << " CeedScalar r_q_"<<i<<"[num_comp_in_"<<i<<"*dim];\n"; 541*9e201c85SYohann code << " gradCollo3d<num_comp_in_"<<i<<",Q_1d>(data, q, r_t_"<<i<<", s_G_in_"<<i<<", r_q_"<<i<<");\n"; 5427d8d0e25Snbeams break; 5437d8d0e25Snbeams case CEED_EVAL_WEIGHT: 544*9e201c85SYohann code << " CeedScalar r_q_"<<i<<"[1];\n"; 545*9e201c85SYohann code << " r_q_"<<i<<"[0] = r_t_"<<i<<"[q];\n"; 5467d8d0e25Snbeams break; // No action 5477d8d0e25Snbeams case CEED_EVAL_DIV: 5487d8d0e25Snbeams break; // TODO: Not implemented 5497d8d0e25Snbeams case CEED_EVAL_CURL: 5507d8d0e25Snbeams break; // TODO: Not implemented 5517d8d0e25Snbeams } 5527d8d0e25Snbeams } 5537d8d0e25Snbeams code << "\n // -- Output fields --\n"; 554*9e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 5557d8d0e25Snbeams code << " // ---- Output field "<<i<<" ----\n"; 556*9e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 557e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 5587d8d0e25Snbeams // Basis action 559*9e201c85SYohann switch (eval_mode) { 5607d8d0e25Snbeams case CEED_EVAL_NONE: 561*9e201c85SYohann code << " CeedScalar r_qq_"<<i<<"[num_comp_out_"<<i<<"];\n"; 5627d8d0e25Snbeams break; // No action 5637d8d0e25Snbeams case CEED_EVAL_INTERP: 564*9e201c85SYohann code << " CeedScalar r_qq_"<<i<<"[num_comp_out_"<<i<<"];\n"; 5657d8d0e25Snbeams break; 5667d8d0e25Snbeams case CEED_EVAL_GRAD: 567*9e201c85SYohann code << " CeedScalar r_qq_"<<i<<"[num_comp_out_"<<i<<"*dim];\n"; 5687d8d0e25Snbeams break; 5697d8d0e25Snbeams case CEED_EVAL_WEIGHT: 5707d8d0e25Snbeams break; // Should not occur 5717d8d0e25Snbeams case CEED_EVAL_DIV: 5727d8d0e25Snbeams break; // TODO: Not implemented 5737d8d0e25Snbeams case CEED_EVAL_CURL: 5747d8d0e25Snbeams break; // TODO: Not implemented 5757d8d0e25Snbeams } 5767d8d0e25Snbeams } 5777d8d0e25Snbeams } else { 578*9e201c85SYohann code << "\n // Note: Using full elements\n"; 5797d8d0e25Snbeams code << " // -- Input fields --\n"; 580*9e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 5817d8d0e25Snbeams code << " // ---- Input field "<<i<<" ----\n"; 582*9e201c85SYohann code << " CeedScalar* r_q_"<<i<<" = r_t_"<<i<<";\n"; 5837d8d0e25Snbeams } 5847d8d0e25Snbeams code << " // -- Output fields --\n"; 585*9e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 5867d8d0e25Snbeams code << " // ---- Output field "<<i<<" ----\n"; 587*9e201c85SYohann code << " CeedScalar* r_qq_"<<i<<" = r_tt_"<<i<<";\n"; 5887d8d0e25Snbeams } 5897d8d0e25Snbeams } 5907d8d0e25Snbeams code << "\n // -- QFunction Inputs and outputs --\n"; 591*9e201c85SYohann code << " CeedScalar* in["<<num_input_fields<<"];\n"; 592*9e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 5937d8d0e25Snbeams code << " // ---- Input field "<<i<<" ----\n"; 594*9e201c85SYohann code << " in["<<i<<"] = r_q_"<<i<<";\n"; 5957d8d0e25Snbeams } 596*9e201c85SYohann code << " CeedScalar* out["<<num_output_fields<<"];\n"; 597*9e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 5987d8d0e25Snbeams code << " // ---- Output field "<<i<<" ----\n"; 599*9e201c85SYohann code << " out["<<i<<"] = r_qq_"<<i<<";\n"; 6007d8d0e25Snbeams } 6017d8d0e25Snbeams code << "\n // -- Apply QFunction --\n"; 602*9e201c85SYohann code << " "<<q_function_name<<"(ctx, "; 603*9e201c85SYohann if (dim != 3 || use_collograd_parallelization) { 6047d8d0e25Snbeams code << "1"; 6057d8d0e25Snbeams } else { 606*9e201c85SYohann code << "Q_1d"; 6077d8d0e25Snbeams } 6087d8d0e25Snbeams code << ", in, out);\n"; 609*9e201c85SYohann if (use_collograd_parallelization) { 6107d8d0e25Snbeams code << " // -- Output fields --\n"; 611*9e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 6127d8d0e25Snbeams code << " // ---- Output field "<<i<<" ----\n"; 613*9e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 614e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 6157d8d0e25Snbeams // Basis action 616*9e201c85SYohann code << " // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n"; 617*9e201c85SYohann switch (eval_mode) { 6187d8d0e25Snbeams case CEED_EVAL_NONE: 619*9e201c85SYohann code << " for (CeedInt j = 0; j < num_comp_out_"<<i<<" ; ++j) {\n"; 620*9e201c85SYohann code << " r_tt_"<<i<<"[q + j*Q_1d] = r_qq_"<<i<<"[j];\n"; 6217d8d0e25Snbeams code << " }\n"; 6227d8d0e25Snbeams break; // No action 6237d8d0e25Snbeams case CEED_EVAL_INTERP: 624*9e201c85SYohann code << " for (CeedInt j = 0; j < num_comp_out_"<<i<<" ; ++j) {\n"; 625*9e201c85SYohann code << " r_tt_"<<i<<"[q + j*Q_1d] = r_qq_"<<i<<"[j];\n"; 6267d8d0e25Snbeams code << " }\n"; 6277d8d0e25Snbeams break; 6287d8d0e25Snbeams case CEED_EVAL_GRAD: 629*9e201c85SYohann code << " gradColloTranspose3d<num_comp_out_"<<i<<",Q_1d>(data, q, r_qq_"<<i<<", s_G_out_"<<i<<", r_tt_"<<i<<");\n"; 6307d8d0e25Snbeams break; 6317d8d0e25Snbeams case CEED_EVAL_WEIGHT: 6327d8d0e25Snbeams break; // Should not occur 6337d8d0e25Snbeams case CEED_EVAL_DIV: 6347d8d0e25Snbeams break; // TODO: Not implemented 6357d8d0e25Snbeams case CEED_EVAL_CURL: 6367d8d0e25Snbeams break; // TODO: Not implemented 6377d8d0e25Snbeams } 6387d8d0e25Snbeams } 6397d8d0e25Snbeams code << " }\n"; 6407d8d0e25Snbeams } 6417d8d0e25Snbeams 6427d8d0e25Snbeams // Output basis apply if needed 6437d8d0e25Snbeams // Generate the correct eval mode code for each output 6447d8d0e25Snbeams code << "\n // -- Output field basis action and restrictions --\n"; 645*9e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 6467d8d0e25Snbeams code << " // ---- Output field "<<i<<" ----\n"; 647*9e201c85SYohann // Get elem_size, eval_mode, num_comp 648*9e201c85SYohann ierr = CeedOperatorFieldGetElemRestriction(op_output_fields[i], &Erestrict); 649e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 650*9e201c85SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elem_size); 651e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 652*9e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 653e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 654*9e201c85SYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &num_comp); 655e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 6567d8d0e25Snbeams // Basis action 657*9e201c85SYohann code << " // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n"; 658*9e201c85SYohann switch (eval_mode) { 6597d8d0e25Snbeams case CEED_EVAL_NONE: 660*9e201c85SYohann code << " CeedScalar* r_v_"<<i<<" = r_tt_"<<i<<";\n"; 6617d8d0e25Snbeams break; // No action 6627d8d0e25Snbeams case CEED_EVAL_INTERP: 663*9e201c85SYohann code << " CeedScalar r_v_"<<i<<"[num_comp_out_"<<i<<"*P_out_"<<i<<"];\n"; 664*9e201c85SYohann code << " InterpTranspose"<<(dim>1?"Tensor":"")<<dim<<"d<num_comp_out_"<<i<<",P_out_"<<i<<",Q_1d>(data, r_tt_"<<i<<", s_B_out_"<<i<<", r_v_"<<i<<");\n"; 6657d8d0e25Snbeams break; 6667d8d0e25Snbeams case CEED_EVAL_GRAD: 667*9e201c85SYohann code << " CeedScalar r_v_"<<i<<"[num_comp_out_"<<i<<"*P_out_"<<i<<"];\n"; 668*9e201c85SYohann if (use_collograd_parallelization) { 669*9e201c85SYohann code << " InterpTranspose"<<(dim>1?"Tensor":"")<<dim<<"d<num_comp_out_"<<i<<",P_out_"<<i<<",Q_1d>(data, r_tt_"<<i<<", s_B_out_"<<i<<", r_v_"<<i<<");\n"; 6707d8d0e25Snbeams } else { 671*9e201c85SYohann CeedInt P_1d; 672*9e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis); CeedChkBackend(ierr); 673*9e201c85SYohann ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 674*9e201c85SYohann code << " GradTranspose"<<(dim>1?"Tensor":"")<<(dim==3&&Q_1d>=P_1d?"Collocated":"")<<dim<<"d<num_comp_out_"<<i<<",P_out_"<<i<<",Q_1d>(data, r_tt_"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v_"<<i<<");\n"; 6757d8d0e25Snbeams } 6767d8d0e25Snbeams break; 6777d8d0e25Snbeams // LCOV_EXCL_START 6787d8d0e25Snbeams case CEED_EVAL_WEIGHT: { 6797d8d0e25Snbeams Ceed ceed; 680e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 681e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 6827d8d0e25Snbeams "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 6837d8d0e25Snbeams break; // Should not occur 6847d8d0e25Snbeams } 6857d8d0e25Snbeams case CEED_EVAL_DIV: 6867d8d0e25Snbeams break; // TODO: Not implemented 6877d8d0e25Snbeams case CEED_EVAL_CURL: 6887d8d0e25Snbeams break; // TODO: Not implemented 6897d8d0e25Snbeams // LCOV_EXCL_STOP 6907d8d0e25Snbeams } 6917d8d0e25Snbeams // Restriction 692*9e201c85SYohann bool is_strided; 693*9e201c85SYohann ierr = CeedElemRestrictionIsStrided(Erestrict, &is_strided); CeedChkBackend(ierr); 694*9e201c85SYohann if (!is_strided) { 6957d8d0e25Snbeams ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 696e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 6977d8d0e25Snbeams code << " const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n"; 698*9e201c85SYohann CeedInt comp_stride; 699*9e201c85SYohann ierr = CeedElemRestrictionGetCompStride(Erestrict, &comp_stride); CeedChkBackend(ierr); 700*9e201c85SYohann code << " // CompStride: "<<comp_stride<<"\n"; 701e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 702*9e201c85SYohann data->indices.outputs[i] = restr_data->d_ind; 703*9e201c85SYohann code << " writeDofsOffset"<<dim<<"d<num_comp_out_"<<i<<", "<<comp_stride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.outputs["<<i<<"], r_v_"<<i<<", d_v_"<<i<<");\n"; 7047d8d0e25Snbeams } else { 705*9e201c85SYohann bool has_backend_strides; 706*9e201c85SYohann ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides); 707e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 708*9e201c85SYohann CeedInt num_elem; 709*9e201c85SYohann ierr = CeedElemRestrictionGetNumElements(Erestrict, &num_elem); 710e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 711*9e201c85SYohann CeedInt strides[3] = {1, elem_size*num_elem, elem_size}; 712*9e201c85SYohann if (!has_backend_strides) { 7137d8d0e25Snbeams ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 714e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 7157d8d0e25Snbeams } 7167d8d0e25Snbeams code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 717*9e201c85SYohann code << " writeDofsStrided"<<dim<<"d<num_comp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v_"<<i<<", d_v_"<<i<<");\n"; 7187d8d0e25Snbeams } 7197d8d0e25Snbeams } 7207d8d0e25Snbeams 7217d8d0e25Snbeams code << " }\n"; 7227d8d0e25Snbeams code << "}\n"; 7237d8d0e25Snbeams code << "// -----------------------------------------------------------------------------\n\n"; 7247d8d0e25Snbeams 7257d8d0e25Snbeams // View kernel for debugging 72646dc0734SJeremy L Thompson CeedDebug256(ceed, 2, "Generated Operator Kernels:\n"); 7273f21f6b1SJeremy L Thompson CeedDebug(ceed, code.str().c_str()); 7287d8d0e25Snbeams 729539ec17dSJeremy L Thompson CeedInt block_sizes[3] = {0, 0, 0}; 730*9e201c85SYohann CeedInt num_elem; 731*9e201c85SYohann ierr = CeedOperatorGetNumElements(op, &num_elem); CeedChkBackend(ierr); 732*9e201c85SYohann ierr = BlockGridCalculate_Hip_gen(dim, num_elem, data->max_P_1d, Q_1d, block_sizes); 733b3e1519bSnbeams CeedChkBackend(ierr); 734b3e1519bSnbeams ierr = CeedCompileHip(ceed, code.str().c_str(), &data->module, 2, 735*9e201c85SYohann "T_1D", block_sizes[0], 736b3e1519bSnbeams "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]); 737e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 738*9e201c85SYohann ierr = CeedGetKernelHip(ceed, data->module, operator_name.c_str(), &data->op); 739e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 7407d8d0e25Snbeams 741e15f9bd0SJeremy L Thompson ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr); 742e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 7437d8d0e25Snbeams } 7447d8d0e25Snbeams //------------------------------------------------------------------------------ 745