13d8e8822SJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3241a4b83SYohann // 43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 5241a4b83SYohann // 63d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 73d576824SJeremy L Thompson 8b8e71988SJeremy L Thompson #define CEED_DEBUG_COLOR 12 97df94212SJeremy L Thompson 10ec3da8bcSJed Brown #include <ceed/ceed.h> 11ec3da8bcSJed Brown #include <ceed/backend.h> 12*9e201c85SYohann #include <ceed/jit-tools.h> 133d576824SJeremy L Thompson #include <cuda_runtime.h> 14241a4b83SYohann #include <iostream> 15241a4b83SYohann #include <sstream> 163d576824SJeremy L Thompson #include "ceed-cuda-gen.h" 176d69246aSJeremy L Thompson #include "../cuda/ceed-cuda-compile.h" 180d0321e0SJeremy L Thompson #include "../cuda-ref/ceed-cuda-ref.h" 19241a4b83SYohann #include "../cuda-shared/ceed-cuda-shared.h" 20241a4b83SYohann 21ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 22ab213215SJeremy L Thompson // Build singe operator kernel 23ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 24241a4b83SYohann extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { 25241a4b83SYohann 26241a4b83SYohann using std::ostringstream; 27241a4b83SYohann using std::string; 28241a4b83SYohann int ierr; 29*9e201c85SYohann bool is_setup_done; 30*9e201c85SYohann ierr = CeedOperatorIsSetupDone(op, &is_setup_done); CeedChkBackend(ierr); 31*9e201c85SYohann if (is_setup_done) return CEED_ERROR_SUCCESS; 32241a4b83SYohann Ceed ceed; 33e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 34241a4b83SYohann CeedOperator_Cuda_gen *data; 35e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr); 36241a4b83SYohann CeedQFunction qf; 37241a4b83SYohann CeedQFunction_Cuda_gen *qf_data; 38e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); 39e15f9bd0SJeremy L Thompson ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr); 40e79b91d9SJeremy L Thompson CeedSize lsize; 41*9e201c85SYohann CeedInt Q, P_1d = 0, Q_1d = 0, elem_size, num_input_fields, 42*9e201c85SYohann num_output_fields, num_comp, dim = 1; 43e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); 44*9e201c85SYohann Q_1d = Q; 45*9e201c85SYohann CeedOperatorField *op_input_fields, *op_output_fields; 46*9e201c85SYohann ierr = CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields); 47e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 48*9e201c85SYohann CeedQFunctionField *qf_input_fields, *qf_output_fields; 49*9e201c85SYohann ierr = CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields); 50e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 51*9e201c85SYohann CeedEvalMode eval_mode; 52241a4b83SYohann CeedBasis basis; 53241a4b83SYohann CeedBasis_Cuda_shared *basis_data; 54241a4b83SYohann CeedElemRestriction Erestrict; 55461525f5SNatalie Beams CeedElemRestriction_Cuda *restr_data; 56241a4b83SYohann 57*9e201c85SYohann // TODO: put in a function? 580b454692Sjeremylt // Check for restriction only identity operator 590b454692Sjeremylt bool is_identity_qf; 600b454692Sjeremylt ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr); 610b454692Sjeremylt if (is_identity_qf) { 62*9e201c85SYohann CeedEvalMode eval_mode_in, eval_mode_out; 63*9e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in); CeedChkBackend(ierr); 64*9e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out); CeedChkBackend(ierr); 65*9e201c85SYohann if (eval_mode_in == CEED_EVAL_NONE && eval_mode_out == CEED_EVAL_NONE) 660b454692Sjeremylt // LCOV_EXCL_START 670b454692Sjeremylt return CeedError(ceed, CEED_ERROR_BACKEND, 680b454692Sjeremylt "Backend does not implement restriction only identity operators"); 690b454692Sjeremylt // LCOV_EXCL_STOP 700b454692Sjeremylt } 710b454692Sjeremylt 72241a4b83SYohann ostringstream code; 73241a4b83SYohann 74*9e201c85SYohann // TODO: put in a function? 75f1a13f77SYohann Dudouit // Add atomicAdd function for old NVidia architectures 76f1a13f77SYohann Dudouit struct cudaDeviceProp prop; 77f1a13f77SYohann Dudouit Ceed_Cuda *ceed_data; 7888db6d3bSJeremy L Thompson ierr = CeedGetData(ceed, &ceed_data); CeedChkBackend(ierr); CeedChkBackend(ierr); 790d0321e0SJeremy L Thompson ierr = cudaGetDeviceProperties(&prop, ceed_data->device_id); CeedChkBackend(ierr); 8080a9ef05SNatalie Beams if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)){ 81*9e201c85SYohann char *atomic_add_path, *atomic_add_source; 82*9e201c85SYohann ierr = CeedGetJitAbsolutePath(ceed, 83*9e201c85SYohann "ceed/jit-source/cuda/cuda-atomic-add-fallback.h", 84*9e201c85SYohann &atomic_add_path); CeedChkBackend(ierr); 85*9e201c85SYohann CeedDebug256(ceed, 2, "----- Loading Atomic Add Source -----\n"); 86*9e201c85SYohann ierr = CeedLoadSourceToBuffer(ceed, atomic_add_path, &atomic_add_source); 87*9e201c85SYohann CeedChkBackend(ierr); 88*9e201c85SYohann code << atomic_add_source; 89*9e201c85SYohann ierr = CeedFree(&atomic_add_path); CeedChkBackend(ierr); 90*9e201c85SYohann ierr = CeedFree(&atomic_add_source); CeedChkBackend(ierr); 91f1a13f77SYohann Dudouit } 92f1a13f77SYohann Dudouit 93*9e201c85SYohann // Load basis source files 94*9e201c85SYohann // TODO: generalize to accept different device functions? 95*9e201c85SYohann { 96*9e201c85SYohann char *tensor_basis_kernel_path, *tensor_basis_kernel_source; 97*9e201c85SYohann ierr = CeedGetJitAbsolutePath(ceed, 98*9e201c85SYohann "ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h", 99*9e201c85SYohann &tensor_basis_kernel_path); CeedChkBackend(ierr); 100*9e201c85SYohann CeedDebug256(ceed, 2, "----- Loading Tensor Basis Kernel Source -----\n"); 101*9e201c85SYohann ierr = CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_kernel_source); 102*9e201c85SYohann CeedChkBackend(ierr); 103*9e201c85SYohann code << tensor_basis_kernel_source; 104*9e201c85SYohann ierr = CeedFree(&tensor_basis_kernel_path); CeedChkBackend(ierr); 105*9e201c85SYohann ierr = CeedFree(&tensor_basis_kernel_source); CeedChkBackend(ierr); 106*9e201c85SYohann } 107*9e201c85SYohann { 108*9e201c85SYohann char *cuda_gen_template_path, *cuda_gen_template_source; 109*9e201c85SYohann ierr = CeedGetJitAbsolutePath(ceed, 110*9e201c85SYohann "ceed/jit-source/cuda/cuda-gen-templates.h", 111*9e201c85SYohann &cuda_gen_template_path); CeedChkBackend(ierr); 112*9e201c85SYohann CeedDebug256(ceed, 2, "----- Loading Cuda-Gen Template Source -----\n"); 113*9e201c85SYohann ierr = CeedLoadSourceToBuffer(ceed, cuda_gen_template_path, &cuda_gen_template_source); 114*9e201c85SYohann CeedChkBackend(ierr); 115*9e201c85SYohann code << cuda_gen_template_source; 116*9e201c85SYohann ierr = CeedFree(&cuda_gen_template_path); CeedChkBackend(ierr); 117*9e201c85SYohann ierr = CeedFree(&cuda_gen_template_source); CeedChkBackend(ierr); 118*9e201c85SYohann } 119241a4b83SYohann 120*9e201c85SYohann // Get QFunction source and name 121*9e201c85SYohann string q_function_source(qf_data->q_function_source); 122*9e201c85SYohann string q_function_name(qf_data->q_function_name); 123*9e201c85SYohann string operator_name; 124*9e201c85SYohann operator_name = "CeedKernel_Cuda_gen_" + q_function_name; 1254d537eeaSYohann 126*9e201c85SYohann // Find dim, P_1d, Q_1d 127*9e201c85SYohann data->max_P_1d = 0; 128*9e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 129*9e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr); 1301da99368SJeremy L Thompson if (basis != CEED_BASIS_COLLOCATED) { 131e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 132*9e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); 133e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1341da99368SJeremy L Thompson 135*9e201c85SYohann // Collect dim, P_1d, and Q_1d 136e15f9bd0SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 1371da99368SJeremy L Thompson bool isTensor; 138e15f9bd0SJeremy L Thompson ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr); 1391da99368SJeremy L Thompson if (isTensor) { 140*9e201c85SYohann ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 141*9e201c85SYohann ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 142*9e201c85SYohann if (P_1d > data->max_P_1d) data->max_P_1d = P_1d; 1431da99368SJeremy L Thompson } else { 144e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 145e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 146e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 1471da99368SJeremy L Thompson } 1481da99368SJeremy L Thompson } 1491da99368SJeremy L Thompson } 150*9e201c85SYohann // Check output bases for Q_1d, dim as well 151dc64899eSYohann // The only input basis might be CEED_BASIS_COLLOCATED 152*9e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 153*9e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis); CeedChkBackend(ierr); 1540f54b25eSjeremylt 1551da99368SJeremy L Thompson if (basis != CEED_BASIS_COLLOCATED) { 156e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 157*9e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 158e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1590f54b25eSjeremylt 160*9e201c85SYohann // Collect Q_1d 161e15f9bd0SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 1621da99368SJeremy L Thompson bool isTensor; 163e15f9bd0SJeremy L Thompson ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr); 1641da99368SJeremy L Thompson if (isTensor) { 165*9e201c85SYohann ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 1661da99368SJeremy L Thompson } else { 167e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 168e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 169e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 1701da99368SJeremy L Thompson } 1711da99368SJeremy L Thompson } 1721da99368SJeremy L Thompson } 1731da99368SJeremy L Thompson data->dim = dim; 174*9e201c85SYohann data->Q_1d = Q_1d; 1751da99368SJeremy L Thompson 176*9e201c85SYohann // Only use 3D collocated gradient parallelization strategy when gradient is computed 177*9e201c85SYohann // TODO: put in a function? 178*9e201c85SYohann bool use_collograd_parallelization = false; 179*9e201c85SYohann if (dim == 3) { 180*9e201c85SYohann bool was_grad_found = false; 181*9e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 182*9e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); 183*9e201c85SYohann if (eval_mode == CEED_EVAL_GRAD) { 184*9e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr); 185*9e201c85SYohann ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 186*9e201c85SYohann use_collograd_parallelization = !!basis_data->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true); 187*9e201c85SYohann was_grad_found = true; 188*9e201c85SYohann } 189*9e201c85SYohann } 190*9e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 191*9e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 192*9e201c85SYohann if (eval_mode == CEED_EVAL_GRAD) { 193*9e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis); CeedChkBackend(ierr); 194*9e201c85SYohann ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 195*9e201c85SYohann use_collograd_parallelization = !!basis_data->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true); 196*9e201c85SYohann was_grad_found = true; 197*9e201c85SYohann } 198*9e201c85SYohann } 1991da99368SJeremy L Thompson } 2001da99368SJeremy L Thompson 201*9e201c85SYohann // Define CEED_Q_VLA 202*9e201c85SYohann code << "\n#undef CEED_Q_VLA\n"; 203*9e201c85SYohann if (dim != 3 || use_collograd_parallelization) { 204*9e201c85SYohann code << "#define CEED_Q_VLA 1\n\n"; 205*9e201c85SYohann } else { 206*9e201c85SYohann code << "#define CEED_Q_VLA "<<Q_1d<<"\n\n"; 207*9e201c85SYohann } 208*9e201c85SYohann 209*9e201c85SYohann code << q_function_source; 210241a4b83SYohann 211241a4b83SYohann // Setup 212818e0025SJeremy L Thompson code << "\n// -----------------------------------------------------------------------------\n"; 213*9e201c85SYohann code << "\nextern \"C\" __global__ void "<<operator_name<<"(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar* W) {\n"; 214*9e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 215*9e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); 216e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 217*9e201c85SYohann if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 218*9e201c85SYohann code << " const CeedScalar* d_u_" <<i<<" = fields.inputs["<<i<<"];\n"; 219241a4b83SYohann } 220241a4b83SYohann } 221241a4b83SYohann 222*9e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 223*9e201c85SYohann code << " CeedScalar* d_v_"<<i<<" = fields.outputs["<<i<<"];\n"; 224241a4b83SYohann } 2251da99368SJeremy L Thompson 226*9e201c85SYohann code << " const CeedInt dim = "<<dim<<";\n"; 227*9e201c85SYohann code << " const CeedInt Q_1d = "<<Q_1d<<";\n"; 2281da99368SJeremy L Thompson 229241a4b83SYohann code << " extern __shared__ CeedScalar slice[];\n"; 230*9e201c85SYohann // TODO put in a function? InitSharedData_Cuda? 231*9e201c85SYohann code << " SharedData_Cuda data;\n"; 232*9e201c85SYohann code << " data.t_id_x = threadIdx.x;\n"; 233*9e201c85SYohann code << " data.t_id_y = threadIdx.y;\n"; 234*9e201c85SYohann code << " data.t_id_z = threadIdx.z;\n"; 235*9e201c85SYohann code << " data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 236*9e201c85SYohann code << " data.slice = slice+data.t_id_z*T_1D"<<(dim>1?"*T_1D":"")<<";\n"; 237920dcdc4Sjeremylt 238818e0025SJeremy L Thompson code << "\n // -- Input field constants and basis data --\n"; 239*9e201c85SYohann // TODO: Put in a function? 240ac421f39SYohann //Initialize constants, and matrices B and G 241*9e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 242818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 243*9e201c85SYohann // Get elem_size, eval_mode, num_comp 244*9e201c85SYohann ierr = CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict); 245e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 246*9e201c85SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elem_size); 247e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 248*9e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); 249e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 250*9e201c85SYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &num_comp); 251e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 252920dcdc4Sjeremylt 253920dcdc4Sjeremylt // Set field constants 254*9e201c85SYohann if (eval_mode != CEED_EVAL_WEIGHT) { 255*9e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr); 256920dcdc4Sjeremylt if (basis != CEED_BASIS_COLLOCATED) { 257*9e201c85SYohann ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 258*9e201c85SYohann code << " const CeedInt P_in_"<<i<<" = "<<P_1d<<";\n"; 259920dcdc4Sjeremylt } else { 260*9e201c85SYohann code << " const CeedInt P_in_"<<i<<" = "<<Q_1d<<";\n"; 261920dcdc4Sjeremylt } 262*9e201c85SYohann code << " const CeedInt num_comp_in_"<<i<<" = "<<num_comp<<";\n"; 263920dcdc4Sjeremylt } 264920dcdc4Sjeremylt 265920dcdc4Sjeremylt // Load basis data 266*9e201c85SYohann code << " // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n"; 267*9e201c85SYohann switch (eval_mode) { 268241a4b83SYohann case CEED_EVAL_NONE: 269241a4b83SYohann break; 270241a4b83SYohann case CEED_EVAL_INTERP: 271e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 272*9e201c85SYohann data->B.inputs[i] = basis_data->d_interp_1d; 273*9e201c85SYohann code << " __shared__ CeedScalar s_B_in_"<<i<<"["<<P_1d*Q_1d<<"];\n"; 274*9e201c85SYohann code << " loadMatrix<P_in_"<<i<<",Q_1d>(data, B.inputs["<<i<<"], s_B_in_"<<i<<");\n"; 275241a4b83SYohann break; 276241a4b83SYohann case CEED_EVAL_GRAD: 277e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 278*9e201c85SYohann data->B.inputs[i] = basis_data->d_interp_1d; 279*9e201c85SYohann code << " __shared__ CeedScalar s_B_in_"<<i<<"["<<P_1d*Q_1d<<"];\n"; 280*9e201c85SYohann code << " loadMatrix<P_in_"<<i<<",Q_1d>(data, B.inputs["<<i<<"], s_B_in_"<<i<<");\n"; 281*9e201c85SYohann if (use_collograd_parallelization) { 282*9e201c85SYohann data->G.inputs[i] = basis_data->d_collo_grad_1d; 283*9e201c85SYohann code << " __shared__ CeedScalar s_G_in_"<<i<<"["<<Q_1d*Q_1d<<"];\n"; 284*9e201c85SYohann code << " loadMatrix<Q_1d,Q_1d>(data, G.inputs["<<i<<"], s_G_in_"<<i<<");\n"; 285ac421f39SYohann } else { 286*9e201c85SYohann bool has_collo_grad = !!basis_data->d_collo_grad_1d; 287*9e201c85SYohann data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; 288*9e201c85SYohann code << " __shared__ CeedScalar s_G_in_"<<i<<"["<<Q_1d*(has_collo_grad?Q_1d:P_1d)<<"];\n"; 289*9e201c85SYohann code << " loadMatrix<"<<(has_collo_grad?"Q_1d":("P_in_"+std::to_string(i)))<<",Q_1d>(data, G.inputs["<<i<<"], s_G_in_"<<i<<");\n"; 290ac421f39SYohann } 291241a4b83SYohann break; 292241a4b83SYohann case CEED_EVAL_WEIGHT: 293241a4b83SYohann break; // No action 294241a4b83SYohann case CEED_EVAL_DIV: 295241a4b83SYohann break; // TODO: Not implemented 296241a4b83SYohann case CEED_EVAL_CURL: 297241a4b83SYohann break; // TODO: Not implemented 298241a4b83SYohann } 299241a4b83SYohann } 300920dcdc4Sjeremylt 301818e0025SJeremy L Thompson code << "\n // -- Output field constants and basis data --\n"; 302*9e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 303818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 304*9e201c85SYohann // Get elem_size, eval_mode, num_comp 305*9e201c85SYohann ierr = CeedOperatorFieldGetElemRestriction(op_output_fields[i], &Erestrict); 306e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 307*9e201c85SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elem_size); 308e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 309*9e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 310e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 311*9e201c85SYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &num_comp); 312e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 313920dcdc4Sjeremylt 314920dcdc4Sjeremylt // Set field constants 315*9e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis); CeedChkBackend(ierr); 316920dcdc4Sjeremylt if (basis != CEED_BASIS_COLLOCATED) { 317*9e201c85SYohann ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 318*9e201c85SYohann code << " const CeedInt P_out_"<<i<<" = "<<P_1d<<";\n"; 319920dcdc4Sjeremylt } else { 320*9e201c85SYohann code << " const CeedInt P_out_"<<i<<" = "<<Q_1d<<";\n"; 321920dcdc4Sjeremylt } 322*9e201c85SYohann code << " const CeedInt num_comp_out_"<<i<<" = "<<num_comp<<";\n"; 323920dcdc4Sjeremylt 324920dcdc4Sjeremylt // Load basis data 325*9e201c85SYohann code << " // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n"; 326*9e201c85SYohann switch (eval_mode) { 327920dcdc4Sjeremylt case CEED_EVAL_NONE: 328920dcdc4Sjeremylt break; // No action 329920dcdc4Sjeremylt case CEED_EVAL_INTERP: 330e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 331*9e201c85SYohann data->B.outputs[i] = basis_data->d_interp_1d; 332*9e201c85SYohann code << " __shared__ CeedScalar s_B_out_"<<i<<"["<<P_1d*Q_1d<<"];\n"; 333*9e201c85SYohann code << " loadMatrix<P_out_"<<i<<",Q_1d>(data, B.outputs["<<i<<"], s_B_out_"<<i<<");\n"; 334241a4b83SYohann break; 335241a4b83SYohann case CEED_EVAL_GRAD: 336e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 337*9e201c85SYohann data->B.outputs[i] = basis_data->d_interp_1d; 338*9e201c85SYohann code << " __shared__ CeedScalar s_B_out_"<<i<<"["<<P_1d*Q_1d<<"];\n"; 339*9e201c85SYohann code << " loadMatrix<P_out_"<<i<<",Q_1d>(data, B.outputs["<<i<<"], s_B_out_"<<i<<");\n"; 340*9e201c85SYohann if (use_collograd_parallelization) { 341*9e201c85SYohann data->G.outputs[i] = basis_data->d_collo_grad_1d; 342*9e201c85SYohann code << " __shared__ CeedScalar s_G_out_"<<i<<"["<<Q_1d*Q_1d<<"];\n"; 343*9e201c85SYohann code << " loadMatrix<Q_1d,Q_1d>(data, G.outputs["<<i<<"], s_G_out_"<<i<<");\n"; 344ac421f39SYohann } else { 345*9e201c85SYohann bool has_collo_grad = !!basis_data->d_collo_grad_1d; 346*9e201c85SYohann data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; 347*9e201c85SYohann code << " __shared__ CeedScalar s_G_out_"<<i<<"["<<Q_1d*(has_collo_grad?Q_1d:P_1d)<<"];\n"; 348*9e201c85SYohann code << " loadMatrix<"<<(has_collo_grad?"Q_1d":("P_out_"+std::to_string(i)))<<",Q_1d>(data, G.outputs["<<i<<"], s_G_out_"<<i<<");\n"; 349ac421f39SYohann } 350241a4b83SYohann break; 351e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 352241a4b83SYohann case CEED_EVAL_WEIGHT: { 353241a4b83SYohann Ceed ceed; 354e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 355e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 356241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 357241a4b83SYohann break; // Should not occur 358241a4b83SYohann } 359241a4b83SYohann case CEED_EVAL_DIV: 360241a4b83SYohann break; // TODO: Not implemented 361241a4b83SYohann case CEED_EVAL_CURL: 362241a4b83SYohann break; // TODO: Not implemented 363e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 364241a4b83SYohann } 365241a4b83SYohann } 366818e0025SJeremy L Thompson code << "\n // -- Element loop --\n"; 367ac421f39SYohann code << " __syncthreads();\n"; 368*9e201c85SYohann code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n"; 369241a4b83SYohann // Input basis apply if needed 370ac421f39SYohann // Generate the correct eval mode code for each input 371818e0025SJeremy L Thompson code << " // -- Input field restrictions and basis actions --\n"; 372*9e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 373818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 374*9e201c85SYohann // Get elem_size, eval_mode, num_comp 375*9e201c85SYohann ierr = CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict); 376e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 377*9e201c85SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elem_size); 378e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 379*9e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); 380e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 381*9e201c85SYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &num_comp); 382e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 383920dcdc4Sjeremylt 384*9e201c85SYohann // TODO: put in a function? 385920dcdc4Sjeremylt // Restriction 386*9e201c85SYohann if (eval_mode != CEED_EVAL_WEIGHT && 387*9e201c85SYohann !((eval_mode == CEED_EVAL_NONE) && use_collograd_parallelization)) { 388*9e201c85SYohann code << " CeedScalar r_u_"<<i<<"[num_comp_in_"<<i<<"*P_in_"<<i<<"];\n"; 3899a2291e3SJeremy L Thompson 390*9e201c85SYohann bool is_strided; 391*9e201c85SYohann ierr = CeedElemRestrictionIsStrided(Erestrict, &is_strided); CeedChkBackend(ierr); 392*9e201c85SYohann if (!is_strided) { 3935c7b696cSjeremylt ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 394e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 3955c7b696cSjeremylt code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 396*9e201c85SYohann CeedInt comp_stride; 397*9e201c85SYohann ierr = CeedElemRestrictionGetCompStride(Erestrict, &comp_stride); CeedChkBackend(ierr); 398*9e201c85SYohann code << " // CompStride: "<<comp_stride<<"\n"; 399e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 400*9e201c85SYohann data->indices.inputs[i] = restr_data->d_ind; 401*9e201c85SYohann code << " readDofsOffset"<<dim<<"d<num_comp_in_"<<i<<", "<<comp_stride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.inputs["<<i<<"], d_u_"<<i<<", r_u_"<<i<<");\n"; 402ccedf6b0Sjeremylt } else { 403920dcdc4Sjeremylt bool backendstrides; 404fd364f38SJeremy L Thompson ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 405e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 406*9e201c85SYohann CeedInt num_elem; 407*9e201c85SYohann ierr = CeedElemRestrictionGetNumElements(Erestrict, &num_elem); 408e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 409*9e201c85SYohann CeedInt strides[3] = {1, elem_size*num_elem, elem_size}; 410920dcdc4Sjeremylt if (!backendstrides) { 411920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 412e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 413ccedf6b0Sjeremylt } 414920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 415*9e201c85SYohann code << " readDofsStrided"<<dim<<"d<num_comp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u_"<<i<<", r_u_"<<i<<");\n"; 416920dcdc4Sjeremylt } 417920dcdc4Sjeremylt } 418920dcdc4Sjeremylt 419*9e201c85SYohann // TODO: put in a function? 420920dcdc4Sjeremylt // Basis action 421*9e201c85SYohann code << " // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n"; 422*9e201c85SYohann switch (eval_mode) { 423920dcdc4Sjeremylt case CEED_EVAL_NONE: 424*9e201c85SYohann if (!use_collograd_parallelization) { 425*9e201c85SYohann code << " CeedScalar* r_t_"<<i<<" = r_u_"<<i<<";\n"; 426920dcdc4Sjeremylt } 427920dcdc4Sjeremylt break; 428920dcdc4Sjeremylt case CEED_EVAL_INTERP: 429*9e201c85SYohann code << " CeedScalar r_t_"<<i<<"[num_comp_in_"<<i<<"*Q_1d];\n"; 430*9e201c85SYohann code << " Interp"<<(dim>1?"Tensor":"")<<dim<<"d<num_comp_in_"<<i<<",P_in_"<<i<<",Q_1d>(data, r_u_"<<i<<", s_B_in_"<<i<<", r_t_"<<i<<");\n"; 431241a4b83SYohann break; 432241a4b83SYohann case CEED_EVAL_GRAD: 433*9e201c85SYohann if (use_collograd_parallelization) { 434*9e201c85SYohann code << " CeedScalar r_t_"<<i<<"[num_comp_in_"<<i<<"*Q_1d];\n"; 435*9e201c85SYohann code << " Interp"<<(dim>1?"Tensor":"")<<dim<<"d<num_comp_in_"<<i<<",P_in_"<<i<<",Q_1d>(data, r_u_"<<i<<", s_B_in_"<<i<<", r_t_"<<i<<");\n"; 436ac421f39SYohann } else { 437*9e201c85SYohann CeedInt P_1d; 438*9e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr); 439*9e201c85SYohann ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 440*9e201c85SYohann code << " CeedScalar r_t_"<<i<<"[num_comp_in_"<<i<<"*dim*Q_1d];\n"; 441*9e201c85SYohann code << " Grad"<<(dim>1?"Tensor":"")<<(dim==3&&Q_1d>=P_1d?"Collocated":"")<<dim<<"d<num_comp_in_"<<i<<",P_in_"<<i<<",Q_1d>(data, r_u_"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t_"<<i<<");\n"; 442ac421f39SYohann } 443241a4b83SYohann break; 444241a4b83SYohann case CEED_EVAL_WEIGHT: 445*9e201c85SYohann code << " CeedScalar r_t_"<<i<<"[Q_1d];\n"; 446*9e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr); 447e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 448437930d1SJeremy L Thompson data->W = basis_data->d_q_weight_1d; 449*9e201c85SYohann code << " Weight"<<(dim>1?"Tensor":"")<<dim<<"d<Q_1d>(data, W, r_t_"<<i<<");\n"; 450241a4b83SYohann break; // No action 451241a4b83SYohann case CEED_EVAL_DIV: 452241a4b83SYohann break; // TODO: Not implemented 453241a4b83SYohann case CEED_EVAL_CURL: 454241a4b83SYohann break; // TODO: Not implemented 455241a4b83SYohann } 456241a4b83SYohann } 457ac421f39SYohann 458*9e201c85SYohann // TODO: put in a function + separate colograd logic 459241a4b83SYohann // Q function 460818e0025SJeremy L Thompson code << "\n // -- Output field setup --\n"; 461*9e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 462818e0025SJeremy L Thompson code << "\n // ---- Output field "<<i<<" ----\n"; 463*9e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 464e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 465*9e201c85SYohann if (eval_mode==CEED_EVAL_GRAD) 466241a4b83SYohann { 467*9e201c85SYohann if (use_collograd_parallelization) { 468ac421f39SYohann //Accumulator for gradient slices 469*9e201c85SYohann code << " CeedScalar r_tt_"<<i<<"[num_comp_out_"<<i<<"*Q_1d];\n"; 470*9e201c85SYohann code << " for (CeedInt i = 0; i < num_comp_out_"<<i<<"; i++) {\n"; 471*9e201c85SYohann code << " for (CeedInt j = 0; j < Q_1d; ++j) {\n"; 472*9e201c85SYohann code << " r_tt_"<<i<<"[j + i*Q_1d] = 0.0;\n"; 473ac421f39SYohann code << " }\n"; 474ac421f39SYohann code << " }\n"; 475ac421f39SYohann } else { 476*9e201c85SYohann code << " CeedScalar r_tt_"<<i<<"[num_comp_out_"<<i<<"*dim*Q_1d];\n"; 477241a4b83SYohann } 478ac421f39SYohann } 479*9e201c85SYohann if (eval_mode==CEED_EVAL_NONE || eval_mode==CEED_EVAL_INTERP) 480241a4b83SYohann { 481*9e201c85SYohann code << " CeedScalar r_tt_"<<i<<"[num_comp_out_"<<i<<"*Q_1d];\n"; 482241a4b83SYohann } 483241a4b83SYohann } 484ac421f39SYohann // We treat quadrature points per slice in 3d to save registers 485*9e201c85SYohann if (use_collograd_parallelization) { 486*9e201c85SYohann code << "\n // Note: Using planes of 3D elements\n"; 487ac421f39SYohann code << "#pragma unroll\n"; 488*9e201c85SYohann code << " for (CeedInt q = 0; q < Q_1d; q++) {\n"; 489818e0025SJeremy L Thompson code << " // -- Input fields --\n"; 490*9e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 491818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 492*9e201c85SYohann // Get elem_size, eval_mode, num_comp 493*9e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); 494e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 495ac421f39SYohann // Basis action 496*9e201c85SYohann code << " // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n"; 497*9e201c85SYohann switch (eval_mode) { 498ac421f39SYohann case CEED_EVAL_NONE: 499*9e201c85SYohann code << " CeedScalar r_q_"<<i<<"[num_comp_in_"<<i<<"];\n"; 5009a2291e3SJeremy L Thompson 501*9e201c85SYohann bool is_strided; 502*9e201c85SYohann ierr = CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict); CeedChkBackend(ierr); 503*9e201c85SYohann ierr = CeedElemRestrictionIsStrided(Erestrict, &is_strided); CeedChkBackend(ierr); 504*9e201c85SYohann if (!is_strided) { 5055c7b696cSjeremylt ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 506e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 5075c7b696cSjeremylt code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 508*9e201c85SYohann CeedInt comp_stride; 509*9e201c85SYohann ierr = CeedElemRestrictionGetCompStride(Erestrict, &comp_stride); CeedChkBackend(ierr); 510*9e201c85SYohann code << " // CompStride: "<<comp_stride<<"\n"; 511e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 512*9e201c85SYohann data->indices.inputs[i] = restr_data->d_ind; 513*9e201c85SYohann code << " readSliceQuadsOffset"<<"3d<num_comp_in_"<<i<<", "<<comp_stride<<", Q_1d>(data, lsize_in_"<<i<<", elem, q, indices.inputs["<<i<<"], d_u_"<<i<<", r_q_"<<i<<");\n"; 514920dcdc4Sjeremylt } else { 515*9e201c85SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elem_size); CeedChkBackend(ierr); 516920dcdc4Sjeremylt bool backendstrides; 517fd364f38SJeremy L Thompson ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 518e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 519*9e201c85SYohann CeedInt num_elem; 520*9e201c85SYohann ierr = CeedElemRestrictionGetNumElements(Erestrict, &num_elem); 521e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 522*9e201c85SYohann CeedInt strides[3] = {1, elem_size*num_elem, elem_size}; 523920dcdc4Sjeremylt if (!backendstrides) { 524920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 525e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 526920dcdc4Sjeremylt } 527920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 528*9e201c85SYohann code << " readSliceQuadsStrided"<<"3d<num_comp_in_"<<i<<",Q_1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u_"<<i<<", r_q_"<<i<<");\n"; 529920dcdc4Sjeremylt } 530ac421f39SYohann break; 531ac421f39SYohann case CEED_EVAL_INTERP: 532*9e201c85SYohann code << " CeedScalar r_q_"<<i<<"[num_comp_in_"<<i<<"];\n"; 533*9e201c85SYohann code << " for (CeedInt j = 0; j < num_comp_in_"<<i<<" ; ++j) {\n"; 534*9e201c85SYohann code << " r_q_"<<i<<"[j] = r_t_"<<i<<"[q + j*Q_1d];\n"; 535ac421f39SYohann code << " }\n"; 536ac421f39SYohann break; 537ac421f39SYohann case CEED_EVAL_GRAD: 538*9e201c85SYohann code << " CeedScalar r_q_"<<i<<"[num_comp_in_"<<i<<"*dim];\n"; 539*9e201c85SYohann code << " gradCollo3d<num_comp_in_"<<i<<",Q_1d>(data, q, r_t_"<<i<<", s_G_in_"<<i<<", r_q_"<<i<<");\n"; 540ac421f39SYohann break; 541ac421f39SYohann case CEED_EVAL_WEIGHT: 542*9e201c85SYohann code << " CeedScalar r_q_"<<i<<"[1];\n"; 543*9e201c85SYohann code << " r_q_"<<i<<"[0] = r_t_"<<i<<"[q];\n"; 544ac421f39SYohann break; // No action 545ac421f39SYohann case CEED_EVAL_DIV: 546ac421f39SYohann break; // TODO: Not implemented 547ac421f39SYohann case CEED_EVAL_CURL: 548ac421f39SYohann break; // TODO: Not implemented 549ac421f39SYohann } 550ac421f39SYohann } 551818e0025SJeremy L Thompson code << "\n // -- Output fields --\n"; 552*9e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 553818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 554*9e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 555e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 556ac421f39SYohann // Basis action 557*9e201c85SYohann switch (eval_mode) { 558ac421f39SYohann case CEED_EVAL_NONE: 559*9e201c85SYohann code << " CeedScalar r_qq_"<<i<<"[num_comp_out_"<<i<<"];\n"; 560ac421f39SYohann break; // No action 561ac421f39SYohann case CEED_EVAL_INTERP: 562*9e201c85SYohann code << " CeedScalar r_qq_"<<i<<"[num_comp_out_"<<i<<"];\n"; 563ac421f39SYohann break; 564ac421f39SYohann case CEED_EVAL_GRAD: 565*9e201c85SYohann code << " CeedScalar r_qq_"<<i<<"[num_comp_out_"<<i<<"*dim];\n"; 566ac421f39SYohann break; 567ac421f39SYohann case CEED_EVAL_WEIGHT: 568ac421f39SYohann break; // Should not occur 569ac421f39SYohann case CEED_EVAL_DIV: 570ac421f39SYohann break; // TODO: Not implemented 571ac421f39SYohann case CEED_EVAL_CURL: 572ac421f39SYohann break; // TODO: Not implemented 573ac421f39SYohann } 574ac421f39SYohann } 575ac421f39SYohann } else { 576*9e201c85SYohann code << "\n // Note: Using full elements\n"; 577818e0025SJeremy L Thompson code << " // -- Input fields --\n"; 578*9e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 579818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 580*9e201c85SYohann code << " CeedScalar* r_q_"<<i<<" = r_t_"<<i<<";\n"; 581ac421f39SYohann } 582818e0025SJeremy L Thompson code << " // -- Output fields --\n"; 583*9e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 584818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 585*9e201c85SYohann code << " CeedScalar* r_qq_"<<i<<" = r_tt_"<<i<<";\n"; 586ac421f39SYohann } 587ac421f39SYohann } 588818e0025SJeremy L Thompson code << "\n // -- QFunction Inputs and outputs --\n"; 589*9e201c85SYohann code << " CeedScalar* in["<<num_input_fields<<"];\n"; 590*9e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 591818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 592*9e201c85SYohann code << " in["<<i<<"] = r_q_"<<i<<";\n"; 5934d537eeaSYohann } 594*9e201c85SYohann code << " CeedScalar* out["<<num_output_fields<<"];\n"; 595*9e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 596818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 597*9e201c85SYohann code << " out["<<i<<"] = r_qq_"<<i<<";\n"; 5984d537eeaSYohann } 599818e0025SJeremy L Thompson code << "\n // -- Apply QFunction --\n"; 600*9e201c85SYohann code << " "<<q_function_name<<"(ctx, "; 601*9e201c85SYohann if (dim != 3 || use_collograd_parallelization) { 602ac421f39SYohann code << "1"; 603ac421f39SYohann } else { 604*9e201c85SYohann code << "Q_1d"; 605ac421f39SYohann } 606ac421f39SYohann code << ", in, out);\n"; 607*9e201c85SYohann if (use_collograd_parallelization) { 608818e0025SJeremy L Thompson code << " // -- Output fields --\n"; 609*9e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 610818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 611*9e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 612e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 613ac421f39SYohann // Basis action 614*9e201c85SYohann code << " // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n"; 615*9e201c85SYohann switch (eval_mode) { 616ac421f39SYohann case CEED_EVAL_NONE: 617*9e201c85SYohann code << " for (CeedInt j = 0; j < num_comp_out_"<<i<<" ; ++j) {\n"; 618*9e201c85SYohann code << " r_tt_"<<i<<"[q + j*Q_1d] = r_qq_"<<i<<"[j];\n"; 619ac421f39SYohann code << " }\n"; 620ac421f39SYohann break; // No action 621ac421f39SYohann case CEED_EVAL_INTERP: 622*9e201c85SYohann code << " for (CeedInt j = 0; j < num_comp_out_"<<i<<" ; ++j) {\n"; 623*9e201c85SYohann code << " r_tt_"<<i<<"[q + j*Q_1d] = r_qq_"<<i<<"[j];\n"; 624ac421f39SYohann code << " }\n"; 625ac421f39SYohann break; 626ac421f39SYohann case CEED_EVAL_GRAD: 627*9e201c85SYohann code << " gradColloTranspose3d<num_comp_out_"<<i<<",Q_1d>(data, q, r_qq_"<<i<<", s_G_out_"<<i<<", r_tt_"<<i<<");\n"; 628ac421f39SYohann break; 629ac421f39SYohann case CEED_EVAL_WEIGHT: 630ac421f39SYohann break; // Should not occur 631ac421f39SYohann case CEED_EVAL_DIV: 632ac421f39SYohann break; // TODO: Not implemented 633ac421f39SYohann case CEED_EVAL_CURL: 634ac421f39SYohann break; // TODO: Not implemented 635ac421f39SYohann } 636ac421f39SYohann } 637ac421f39SYohann code << " }\n"; 638ac421f39SYohann } 639241a4b83SYohann 640241a4b83SYohann // Output basis apply if needed 641ac421f39SYohann // Generate the correct eval mode code for each output 642818e0025SJeremy L Thompson code << "\n // -- Output field basis action and restrictions --\n"; 643*9e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 644818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 645*9e201c85SYohann // Get elem_size, eval_mode, num_comp 646*9e201c85SYohann ierr = CeedOperatorFieldGetElemRestriction(op_output_fields[i], &Erestrict); 647e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 648*9e201c85SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elem_size); 649e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 650*9e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 651e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 652*9e201c85SYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &num_comp); 653e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 654*9e201c85SYohann // TODO put in a function 655241a4b83SYohann // Basis action 656*9e201c85SYohann code << " // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n"; 657*9e201c85SYohann switch (eval_mode) { 658241a4b83SYohann case CEED_EVAL_NONE: 659*9e201c85SYohann code << " CeedScalar* r_v_"<<i<<" = r_tt_"<<i<<";\n"; 660241a4b83SYohann break; // No action 661241a4b83SYohann case CEED_EVAL_INTERP: 662*9e201c85SYohann code << " CeedScalar r_v_"<<i<<"[num_comp_out_"<<i<<"*P_out_"<<i<<"];\n"; 663*9e201c85SYohann code << " InterpTranspose"<<(dim>1?"Tensor":"")<<dim<<"d<num_comp_out_"<<i<<",P_out_"<<i<<",Q_1d>(data, r_tt_"<<i<<", s_B_out_"<<i<<", r_v_"<<i<<");\n"; 664241a4b83SYohann break; 665241a4b83SYohann case CEED_EVAL_GRAD: 666*9e201c85SYohann code << " CeedScalar r_v_"<<i<<"[num_comp_out_"<<i<<"*P_out_"<<i<<"];\n"; 667*9e201c85SYohann if (use_collograd_parallelization) { 668*9e201c85SYohann code << " InterpTranspose"<<(dim>1?"Tensor":"")<<dim<<"d<num_comp_out_"<<i<<",P_out_"<<i<<",Q_1d>(data, r_tt_"<<i<<", s_B_out_"<<i<<", r_v_"<<i<<");\n"; 669ac421f39SYohann } else { 670*9e201c85SYohann CeedInt P_1d; 671*9e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis); CeedChkBackend(ierr); 672*9e201c85SYohann ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 673*9e201c85SYohann code << " GradTranspose"<<(dim>1?"Tensor":"")<<(dim==3&&Q_1d>=P_1d?"Collocated":"")<<dim<<"d<num_comp_out_"<<i<<",P_out_"<<i<<",Q_1d>(data, r_tt_"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v_"<<i<<");\n"; 674ac421f39SYohann } 675241a4b83SYohann break; 676e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 677241a4b83SYohann case CEED_EVAL_WEIGHT: { 678241a4b83SYohann Ceed ceed; 679e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 680e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 681241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 682241a4b83SYohann break; // Should not occur 683241a4b83SYohann } 684241a4b83SYohann case CEED_EVAL_DIV: 685241a4b83SYohann break; // TODO: Not implemented 686241a4b83SYohann case CEED_EVAL_CURL: 687241a4b83SYohann break; // TODO: Not implemented 688e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 689241a4b83SYohann } 690*9e201c85SYohann // TODO put in a function 691920dcdc4Sjeremylt // Restriction 692*9e201c85SYohann bool is_strided; 693*9e201c85SYohann ierr = CeedElemRestrictionIsStrided(Erestrict, &is_strided); CeedChkBackend(ierr); 694*9e201c85SYohann if (!is_strided) { 6955c7b696cSjeremylt ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 696e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 6975c7b696cSjeremylt code << " const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n"; 698*9e201c85SYohann CeedInt comp_stride; 699*9e201c85SYohann ierr = CeedElemRestrictionGetCompStride(Erestrict, &comp_stride); CeedChkBackend(ierr); 700*9e201c85SYohann code << " // CompStride: "<<comp_stride<<"\n"; 701e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 702*9e201c85SYohann data->indices.outputs[i] = restr_data->d_ind; 703*9e201c85SYohann code << " writeDofsOffset"<<dim<<"d<num_comp_out_"<<i<<", "<<comp_stride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.outputs["<<i<<"], r_v_"<<i<<", d_v_"<<i<<");\n"; 704920dcdc4Sjeremylt } else { 705*9e201c85SYohann bool has_backend_strides; 706*9e201c85SYohann ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides); 707e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 708*9e201c85SYohann CeedInt num_elem; 709*9e201c85SYohann ierr = CeedElemRestrictionGetNumElements(Erestrict, &num_elem); 710e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 711*9e201c85SYohann CeedInt strides[3] = {1, elem_size*num_elem, elem_size}; 712*9e201c85SYohann if (!has_backend_strides) { 713920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 714e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 715920dcdc4Sjeremylt } 716920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 717*9e201c85SYohann code << " writeDofsStrided"<<dim<<"d<num_comp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v_"<<i<<", d_v_"<<i<<");\n"; 718920dcdc4Sjeremylt } 719241a4b83SYohann } 720241a4b83SYohann 721241a4b83SYohann code << " }\n"; 722818e0025SJeremy L Thompson code << "}\n"; 723818e0025SJeremy L Thompson code << "// -----------------------------------------------------------------------------\n\n"; 724241a4b83SYohann 725ab213215SJeremy L Thompson // View kernel for debugging 72646dc0734SJeremy L Thompson CeedDebug256(ceed, 2, "Generated Operator Kernels:\n"); 7273f21f6b1SJeremy L Thompson CeedDebug(ceed, code.str().c_str()); 728241a4b83SYohann 72918d499f1SYohann ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 1, 730*9e201c85SYohann "T_1D", CeedIntMax(Q_1d, data->max_P_1d)); 731e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 732*9e201c85SYohann ierr = CeedGetKernelCuda(ceed, data->module, operator_name.c_str(), &data->op); 733e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 734241a4b83SYohann 735e15f9bd0SJeremy L Thompson ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr); 736e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 737241a4b83SYohann } 738ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 739