13d8e8822SJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3241a4b83SYohann // 43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 5241a4b83SYohann // 63d8e8822SJeremy L Thompson // This file is part of CEED: http://github.com/ceed 73d576824SJeremy L Thompson 8b8e71988SJeremy L Thompson #define CEED_DEBUG_COLOR 12 97df94212SJeremy L Thompson 10ec3da8bcSJed Brown #include <ceed/ceed.h> 11ec3da8bcSJed Brown #include <ceed/backend.h> 129e201c85SYohann #include <ceed/jit-tools.h> 133d576824SJeremy L Thompson #include <cuda_runtime.h> 14241a4b83SYohann #include <iostream> 15241a4b83SYohann #include <sstream> 163d576824SJeremy L Thompson #include "ceed-cuda-gen.h" 176d69246aSJeremy L Thompson #include "../cuda/ceed-cuda-compile.h" 180d0321e0SJeremy L Thompson #include "../cuda-ref/ceed-cuda-ref.h" 19241a4b83SYohann #include "../cuda-shared/ceed-cuda-shared.h" 20241a4b83SYohann 21ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 22ab213215SJeremy L Thompson // Build singe operator kernel 23ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 24241a4b83SYohann extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) { 25241a4b83SYohann 26241a4b83SYohann using std::ostringstream; 27241a4b83SYohann using std::string; 28241a4b83SYohann int ierr; 299e201c85SYohann bool is_setup_done; 309e201c85SYohann ierr = CeedOperatorIsSetupDone(op, &is_setup_done); CeedChkBackend(ierr); 319e201c85SYohann if (is_setup_done) return CEED_ERROR_SUCCESS; 32241a4b83SYohann Ceed ceed; 33e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 34241a4b83SYohann CeedOperator_Cuda_gen *data; 35e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetData(op, &data); CeedChkBackend(ierr); 36241a4b83SYohann CeedQFunction qf; 37241a4b83SYohann CeedQFunction_Cuda_gen *qf_data; 38e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetQFunction(op, &qf); CeedChkBackend(ierr); 39e15f9bd0SJeremy L Thompson ierr = CeedQFunctionGetData(qf, &qf_data); CeedChkBackend(ierr); 40e79b91d9SJeremy L Thompson CeedSize lsize; 419e201c85SYohann CeedInt Q, P_1d = 0, Q_1d = 0, elem_size, num_input_fields, 429e201c85SYohann num_output_fields, num_comp, dim = 1; 43e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChkBackend(ierr); 449e201c85SYohann Q_1d = Q; 459e201c85SYohann CeedOperatorField *op_input_fields, *op_output_fields; 469e201c85SYohann ierr = CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields); 47e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 489e201c85SYohann CeedQFunctionField *qf_input_fields, *qf_output_fields; 499e201c85SYohann ierr = CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields); 50e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 519e201c85SYohann CeedEvalMode eval_mode; 52241a4b83SYohann CeedBasis basis; 53241a4b83SYohann CeedBasis_Cuda_shared *basis_data; 54241a4b83SYohann CeedElemRestriction Erestrict; 55461525f5SNatalie Beams CeedElemRestriction_Cuda *restr_data; 56241a4b83SYohann 579e201c85SYohann // TODO: put in a function? 580b454692Sjeremylt // Check for restriction only identity operator 590b454692Sjeremylt bool is_identity_qf; 600b454692Sjeremylt ierr = CeedQFunctionIsIdentity(qf, &is_identity_qf); CeedChkBackend(ierr); 610b454692Sjeremylt if (is_identity_qf) { 629e201c85SYohann CeedEvalMode eval_mode_in, eval_mode_out; 639e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in); CeedChkBackend(ierr); 649e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out); CeedChkBackend(ierr); 659e201c85SYohann if (eval_mode_in == CEED_EVAL_NONE && eval_mode_out == CEED_EVAL_NONE) 660b454692Sjeremylt // LCOV_EXCL_START 670b454692Sjeremylt return CeedError(ceed, CEED_ERROR_BACKEND, 680b454692Sjeremylt "Backend does not implement restriction only identity operators"); 690b454692Sjeremylt // LCOV_EXCL_STOP 700b454692Sjeremylt } 710b454692Sjeremylt 72241a4b83SYohann ostringstream code; 73241a4b83SYohann 749e201c85SYohann // TODO: put in a function? 75f1a13f77SYohann Dudouit // Add atomicAdd function for old NVidia architectures 76f1a13f77SYohann Dudouit struct cudaDeviceProp prop; 77f1a13f77SYohann Dudouit Ceed_Cuda *ceed_data; 7888db6d3bSJeremy L Thompson ierr = CeedGetData(ceed, &ceed_data); CeedChkBackend(ierr); CeedChkBackend(ierr); 790d0321e0SJeremy L Thompson ierr = cudaGetDeviceProperties(&prop, ceed_data->device_id); CeedChkBackend(ierr); 8080a9ef05SNatalie Beams if ((prop.major < 6) && (CEED_SCALAR_TYPE != CEED_SCALAR_FP32)){ 819e201c85SYohann char *atomic_add_path, *atomic_add_source; 829e201c85SYohann ierr = CeedGetJitAbsolutePath(ceed, 839e201c85SYohann "ceed/jit-source/cuda/cuda-atomic-add-fallback.h", 849e201c85SYohann &atomic_add_path); CeedChkBackend(ierr); 859e201c85SYohann CeedDebug256(ceed, 2, "----- Loading Atomic Add Source -----\n"); 869e201c85SYohann ierr = CeedLoadSourceToBuffer(ceed, atomic_add_path, &atomic_add_source); 879e201c85SYohann CeedChkBackend(ierr); 889e201c85SYohann code << atomic_add_source; 899e201c85SYohann ierr = CeedFree(&atomic_add_path); CeedChkBackend(ierr); 909e201c85SYohann ierr = CeedFree(&atomic_add_source); CeedChkBackend(ierr); 91f1a13f77SYohann Dudouit } 92f1a13f77SYohann Dudouit 939e201c85SYohann // Load basis source files 949e201c85SYohann // TODO: generalize to accept different device functions? 959e201c85SYohann { 969e201c85SYohann char *tensor_basis_kernel_path, *tensor_basis_kernel_source; 979e201c85SYohann ierr = CeedGetJitAbsolutePath(ceed, 989e201c85SYohann "ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h", 999e201c85SYohann &tensor_basis_kernel_path); CeedChkBackend(ierr); 1009e201c85SYohann CeedDebug256(ceed, 2, "----- Loading Tensor Basis Kernel Source -----\n"); 1019e201c85SYohann ierr = CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_kernel_source); 1029e201c85SYohann CeedChkBackend(ierr); 1039e201c85SYohann code << tensor_basis_kernel_source; 1049e201c85SYohann ierr = CeedFree(&tensor_basis_kernel_path); CeedChkBackend(ierr); 1059e201c85SYohann ierr = CeedFree(&tensor_basis_kernel_source); CeedChkBackend(ierr); 1069e201c85SYohann } 1079e201c85SYohann { 1089e201c85SYohann char *cuda_gen_template_path, *cuda_gen_template_source; 1099e201c85SYohann ierr = CeedGetJitAbsolutePath(ceed, 1109e201c85SYohann "ceed/jit-source/cuda/cuda-gen-templates.h", 1119e201c85SYohann &cuda_gen_template_path); CeedChkBackend(ierr); 1129e201c85SYohann CeedDebug256(ceed, 2, "----- Loading Cuda-Gen Template Source -----\n"); 1139e201c85SYohann ierr = CeedLoadSourceToBuffer(ceed, cuda_gen_template_path, &cuda_gen_template_source); 1149e201c85SYohann CeedChkBackend(ierr); 1159e201c85SYohann code << cuda_gen_template_source; 1169e201c85SYohann ierr = CeedFree(&cuda_gen_template_path); CeedChkBackend(ierr); 1179e201c85SYohann ierr = CeedFree(&cuda_gen_template_source); CeedChkBackend(ierr); 1189e201c85SYohann } 119241a4b83SYohann 1209e201c85SYohann // Get QFunction source and name 1219e201c85SYohann string q_function_source(qf_data->q_function_source); 1229e201c85SYohann string q_function_name(qf_data->q_function_name); 1239e201c85SYohann string operator_name; 124*204bfdd7SJeremy L Thompson operator_name = "CeedKernelCudaGenOperator_" + q_function_name; 1254d537eeaSYohann 1269e201c85SYohann // Find dim, P_1d, Q_1d 1279e201c85SYohann data->max_P_1d = 0; 1289e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 1299e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr); 1301da99368SJeremy L Thompson if (basis != CEED_BASIS_COLLOCATED) { 131e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 1329e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); 133e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1341da99368SJeremy L Thompson 1359e201c85SYohann // Collect dim, P_1d, and Q_1d 136e15f9bd0SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 1371da99368SJeremy L Thompson bool isTensor; 138e15f9bd0SJeremy L Thompson ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr); 1391da99368SJeremy L Thompson if (isTensor) { 1409e201c85SYohann ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 1419e201c85SYohann ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 1429e201c85SYohann if (P_1d > data->max_P_1d) data->max_P_1d = P_1d; 1431da99368SJeremy L Thompson } else { 144e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 145e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 146e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 1471da99368SJeremy L Thompson } 1481da99368SJeremy L Thompson } 1491da99368SJeremy L Thompson } 1509e201c85SYohann // Check output bases for Q_1d, dim as well 151dc64899eSYohann // The only input basis might be CEED_BASIS_COLLOCATED 1529e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 1539e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis); CeedChkBackend(ierr); 1540f54b25eSjeremylt 1551da99368SJeremy L Thompson if (basis != CEED_BASIS_COLLOCATED) { 156e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 1579e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 158e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 1590f54b25eSjeremylt 1609e201c85SYohann // Collect Q_1d 161e15f9bd0SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 1621da99368SJeremy L Thompson bool isTensor; 163e15f9bd0SJeremy L Thompson ierr = CeedBasisIsTensor(basis, &isTensor); CeedChkBackend(ierr); 1641da99368SJeremy L Thompson if (isTensor) { 1659e201c85SYohann ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d); CeedChkBackend(ierr); 1661da99368SJeremy L Thompson } else { 167e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 168e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis"); 169e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 1701da99368SJeremy L Thompson } 1711da99368SJeremy L Thompson } 1721da99368SJeremy L Thompson } 1731da99368SJeremy L Thompson data->dim = dim; 1749e201c85SYohann data->Q_1d = Q_1d; 1751da99368SJeremy L Thompson 1769e201c85SYohann // Only use 3D collocated gradient parallelization strategy when gradient is computed 1779e201c85SYohann // TODO: put in a function? 1789e201c85SYohann bool use_collograd_parallelization = false; 1799e201c85SYohann if (dim == 3) { 1809e201c85SYohann bool was_grad_found = false; 1819e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 1829e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); 1839e201c85SYohann if (eval_mode == CEED_EVAL_GRAD) { 1849e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr); 1859e201c85SYohann ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 1869e201c85SYohann use_collograd_parallelization = !!basis_data->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true); 1879e201c85SYohann was_grad_found = true; 1889e201c85SYohann } 1899e201c85SYohann } 1909e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 1919e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 1929e201c85SYohann if (eval_mode == CEED_EVAL_GRAD) { 1939e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis); CeedChkBackend(ierr); 1949e201c85SYohann ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 1959e201c85SYohann use_collograd_parallelization = !!basis_data->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true); 1969e201c85SYohann was_grad_found = true; 1979e201c85SYohann } 1989e201c85SYohann } 1991da99368SJeremy L Thompson } 2001da99368SJeremy L Thompson 2019e201c85SYohann // Define CEED_Q_VLA 2029e201c85SYohann code << "\n#undef CEED_Q_VLA\n"; 2039e201c85SYohann if (dim != 3 || use_collograd_parallelization) { 2049e201c85SYohann code << "#define CEED_Q_VLA 1\n\n"; 2059e201c85SYohann } else { 2069e201c85SYohann code << "#define CEED_Q_VLA "<<Q_1d<<"\n\n"; 2079e201c85SYohann } 2089e201c85SYohann 2099e201c85SYohann code << q_function_source; 210241a4b83SYohann 211241a4b83SYohann // Setup 212818e0025SJeremy L Thompson code << "\n// -----------------------------------------------------------------------------\n"; 2139e201c85SYohann code << "\nextern \"C\" __global__ void "<<operator_name<<"(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar* W) {\n"; 2149e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 2159e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); 216e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 2179e201c85SYohann if (eval_mode != CEED_EVAL_WEIGHT) { // Skip CEED_EVAL_WEIGHT 2189e201c85SYohann code << " const CeedScalar* d_u_" <<i<<" = fields.inputs["<<i<<"];\n"; 219241a4b83SYohann } 220241a4b83SYohann } 221241a4b83SYohann 2229e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 2239e201c85SYohann code << " CeedScalar* d_v_"<<i<<" = fields.outputs["<<i<<"];\n"; 224241a4b83SYohann } 2251da99368SJeremy L Thompson 2269e201c85SYohann code << " const CeedInt dim = "<<dim<<";\n"; 2279e201c85SYohann code << " const CeedInt Q_1d = "<<Q_1d<<";\n"; 2281da99368SJeremy L Thompson 229241a4b83SYohann code << " extern __shared__ CeedScalar slice[];\n"; 2309e201c85SYohann // TODO put in a function? InitSharedData_Cuda? 2319e201c85SYohann code << " SharedData_Cuda data;\n"; 2329e201c85SYohann code << " data.t_id_x = threadIdx.x;\n"; 2339e201c85SYohann code << " data.t_id_y = threadIdx.y;\n"; 2349e201c85SYohann code << " data.t_id_z = threadIdx.z;\n"; 2359e201c85SYohann code << " data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; 2369e201c85SYohann code << " data.slice = slice+data.t_id_z*T_1D"<<(dim>1?"*T_1D":"")<<";\n"; 237920dcdc4Sjeremylt 238818e0025SJeremy L Thompson code << "\n // -- Input field constants and basis data --\n"; 2399e201c85SYohann // TODO: Put in a function? 240ac421f39SYohann //Initialize constants, and matrices B and G 2419e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 242818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 2439e201c85SYohann // Get elem_size, eval_mode, num_comp 2449e201c85SYohann ierr = CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict); 245e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 2469e201c85SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elem_size); 247e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 2489e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); 249e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 2509e201c85SYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &num_comp); 251e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 252920dcdc4Sjeremylt 253920dcdc4Sjeremylt // Set field constants 2549e201c85SYohann if (eval_mode != CEED_EVAL_WEIGHT) { 2559e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr); 256920dcdc4Sjeremylt if (basis != CEED_BASIS_COLLOCATED) { 2579e201c85SYohann ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 2589e201c85SYohann code << " const CeedInt P_in_"<<i<<" = "<<P_1d<<";\n"; 259920dcdc4Sjeremylt } else { 2609e201c85SYohann code << " const CeedInt P_in_"<<i<<" = "<<Q_1d<<";\n"; 261920dcdc4Sjeremylt } 2629e201c85SYohann code << " const CeedInt num_comp_in_"<<i<<" = "<<num_comp<<";\n"; 263920dcdc4Sjeremylt } 264920dcdc4Sjeremylt 265920dcdc4Sjeremylt // Load basis data 2669e201c85SYohann code << " // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n"; 2679e201c85SYohann switch (eval_mode) { 268241a4b83SYohann case CEED_EVAL_NONE: 269241a4b83SYohann break; 270241a4b83SYohann case CEED_EVAL_INTERP: 271e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 2729e201c85SYohann data->B.inputs[i] = basis_data->d_interp_1d; 2739e201c85SYohann code << " __shared__ CeedScalar s_B_in_"<<i<<"["<<P_1d*Q_1d<<"];\n"; 2749e201c85SYohann code << " loadMatrix<P_in_"<<i<<",Q_1d>(data, B.inputs["<<i<<"], s_B_in_"<<i<<");\n"; 275241a4b83SYohann break; 276241a4b83SYohann case CEED_EVAL_GRAD: 277e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 2789e201c85SYohann data->B.inputs[i] = basis_data->d_interp_1d; 2799e201c85SYohann code << " __shared__ CeedScalar s_B_in_"<<i<<"["<<P_1d*Q_1d<<"];\n"; 2809e201c85SYohann code << " loadMatrix<P_in_"<<i<<",Q_1d>(data, B.inputs["<<i<<"], s_B_in_"<<i<<");\n"; 2819e201c85SYohann if (use_collograd_parallelization) { 2829e201c85SYohann data->G.inputs[i] = basis_data->d_collo_grad_1d; 2839e201c85SYohann code << " __shared__ CeedScalar s_G_in_"<<i<<"["<<Q_1d*Q_1d<<"];\n"; 2849e201c85SYohann code << " loadMatrix<Q_1d,Q_1d>(data, G.inputs["<<i<<"], s_G_in_"<<i<<");\n"; 285ac421f39SYohann } else { 2869e201c85SYohann bool has_collo_grad = !!basis_data->d_collo_grad_1d; 2879e201c85SYohann data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; 2889e201c85SYohann code << " __shared__ CeedScalar s_G_in_"<<i<<"["<<Q_1d*(has_collo_grad?Q_1d:P_1d)<<"];\n"; 2899e201c85SYohann code << " loadMatrix<"<<(has_collo_grad?"Q_1d":("P_in_"+std::to_string(i)))<<",Q_1d>(data, G.inputs["<<i<<"], s_G_in_"<<i<<");\n"; 290ac421f39SYohann } 291241a4b83SYohann break; 292241a4b83SYohann case CEED_EVAL_WEIGHT: 293241a4b83SYohann break; // No action 294241a4b83SYohann case CEED_EVAL_DIV: 295241a4b83SYohann break; // TODO: Not implemented 296241a4b83SYohann case CEED_EVAL_CURL: 297241a4b83SYohann break; // TODO: Not implemented 298241a4b83SYohann } 299241a4b83SYohann } 300920dcdc4Sjeremylt 301818e0025SJeremy L Thompson code << "\n // -- Output field constants and basis data --\n"; 3029e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 303818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 3049e201c85SYohann // Get elem_size, eval_mode, num_comp 3059e201c85SYohann ierr = CeedOperatorFieldGetElemRestriction(op_output_fields[i], &Erestrict); 306e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 3079e201c85SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elem_size); 308e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 3099e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 310e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 3119e201c85SYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &num_comp); 312e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 313920dcdc4Sjeremylt 314920dcdc4Sjeremylt // Set field constants 3159e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis); CeedChkBackend(ierr); 316920dcdc4Sjeremylt if (basis != CEED_BASIS_COLLOCATED) { 3179e201c85SYohann ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 3189e201c85SYohann code << " const CeedInt P_out_"<<i<<" = "<<P_1d<<";\n"; 319920dcdc4Sjeremylt } else { 3209e201c85SYohann code << " const CeedInt P_out_"<<i<<" = "<<Q_1d<<";\n"; 321920dcdc4Sjeremylt } 3229e201c85SYohann code << " const CeedInt num_comp_out_"<<i<<" = "<<num_comp<<";\n"; 323920dcdc4Sjeremylt 324920dcdc4Sjeremylt // Load basis data 3259e201c85SYohann code << " // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n"; 3269e201c85SYohann switch (eval_mode) { 327920dcdc4Sjeremylt case CEED_EVAL_NONE: 328920dcdc4Sjeremylt break; // No action 329920dcdc4Sjeremylt case CEED_EVAL_INTERP: 330e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 3319e201c85SYohann data->B.outputs[i] = basis_data->d_interp_1d; 3329e201c85SYohann code << " __shared__ CeedScalar s_B_out_"<<i<<"["<<P_1d*Q_1d<<"];\n"; 3339e201c85SYohann code << " loadMatrix<P_out_"<<i<<",Q_1d>(data, B.outputs["<<i<<"], s_B_out_"<<i<<");\n"; 334241a4b83SYohann break; 335241a4b83SYohann case CEED_EVAL_GRAD: 336e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 3379e201c85SYohann data->B.outputs[i] = basis_data->d_interp_1d; 3389e201c85SYohann code << " __shared__ CeedScalar s_B_out_"<<i<<"["<<P_1d*Q_1d<<"];\n"; 3399e201c85SYohann code << " loadMatrix<P_out_"<<i<<",Q_1d>(data, B.outputs["<<i<<"], s_B_out_"<<i<<");\n"; 3409e201c85SYohann if (use_collograd_parallelization) { 3419e201c85SYohann data->G.outputs[i] = basis_data->d_collo_grad_1d; 3429e201c85SYohann code << " __shared__ CeedScalar s_G_out_"<<i<<"["<<Q_1d*Q_1d<<"];\n"; 3439e201c85SYohann code << " loadMatrix<Q_1d,Q_1d>(data, G.outputs["<<i<<"], s_G_out_"<<i<<");\n"; 344ac421f39SYohann } else { 3459e201c85SYohann bool has_collo_grad = !!basis_data->d_collo_grad_1d; 3469e201c85SYohann data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; 3479e201c85SYohann code << " __shared__ CeedScalar s_G_out_"<<i<<"["<<Q_1d*(has_collo_grad?Q_1d:P_1d)<<"];\n"; 3489e201c85SYohann code << " loadMatrix<"<<(has_collo_grad?"Q_1d":("P_out_"+std::to_string(i)))<<",Q_1d>(data, G.outputs["<<i<<"], s_G_out_"<<i<<");\n"; 349ac421f39SYohann } 350241a4b83SYohann break; 351e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 352241a4b83SYohann case CEED_EVAL_WEIGHT: { 353241a4b83SYohann Ceed ceed; 354e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 355e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 356241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 357241a4b83SYohann break; // Should not occur 358241a4b83SYohann } 359241a4b83SYohann case CEED_EVAL_DIV: 360241a4b83SYohann break; // TODO: Not implemented 361241a4b83SYohann case CEED_EVAL_CURL: 362241a4b83SYohann break; // TODO: Not implemented 363e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 364241a4b83SYohann } 365241a4b83SYohann } 366818e0025SJeremy L Thompson code << "\n // -- Element loop --\n"; 367ac421f39SYohann code << " __syncthreads();\n"; 3689e201c85SYohann code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n"; 369241a4b83SYohann // Input basis apply if needed 370ac421f39SYohann // Generate the correct eval mode code for each input 371818e0025SJeremy L Thompson code << " // -- Input field restrictions and basis actions --\n"; 3729e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 373818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 3749e201c85SYohann // Get elem_size, eval_mode, num_comp 3759e201c85SYohann ierr = CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict); 376e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 3779e201c85SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elem_size); 378e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 3799e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); 380e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 3819e201c85SYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &num_comp); 382e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 383920dcdc4Sjeremylt 3849e201c85SYohann // TODO: put in a function? 385920dcdc4Sjeremylt // Restriction 3869e201c85SYohann if (eval_mode != CEED_EVAL_WEIGHT && 3879e201c85SYohann !((eval_mode == CEED_EVAL_NONE) && use_collograd_parallelization)) { 3889e201c85SYohann code << " CeedScalar r_u_"<<i<<"[num_comp_in_"<<i<<"*P_in_"<<i<<"];\n"; 3899a2291e3SJeremy L Thompson 3909e201c85SYohann bool is_strided; 3919e201c85SYohann ierr = CeedElemRestrictionIsStrided(Erestrict, &is_strided); CeedChkBackend(ierr); 3929e201c85SYohann if (!is_strided) { 3935c7b696cSjeremylt ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 394e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 3955c7b696cSjeremylt code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 3969e201c85SYohann CeedInt comp_stride; 3979e201c85SYohann ierr = CeedElemRestrictionGetCompStride(Erestrict, &comp_stride); CeedChkBackend(ierr); 3989e201c85SYohann code << " // CompStride: "<<comp_stride<<"\n"; 399e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 4009e201c85SYohann data->indices.inputs[i] = restr_data->d_ind; 4019e201c85SYohann code << " readDofsOffset"<<dim<<"d<num_comp_in_"<<i<<", "<<comp_stride<<", P_in_"<<i<<">(data, lsize_in_"<<i<<", elem, indices.inputs["<<i<<"], d_u_"<<i<<", r_u_"<<i<<");\n"; 402ccedf6b0Sjeremylt } else { 403920dcdc4Sjeremylt bool backendstrides; 404fd364f38SJeremy L Thompson ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 405e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 4069e201c85SYohann CeedInt num_elem; 4079e201c85SYohann ierr = CeedElemRestrictionGetNumElements(Erestrict, &num_elem); 408e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 4099e201c85SYohann CeedInt strides[3] = {1, elem_size*num_elem, elem_size}; 410920dcdc4Sjeremylt if (!backendstrides) { 411920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 412e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 413ccedf6b0Sjeremylt } 414920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 4159e201c85SYohann code << " readDofsStrided"<<dim<<"d<num_comp_in_"<<i<<",P_in_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, d_u_"<<i<<", r_u_"<<i<<");\n"; 416920dcdc4Sjeremylt } 417920dcdc4Sjeremylt } 418920dcdc4Sjeremylt 4199e201c85SYohann // TODO: put in a function? 420920dcdc4Sjeremylt // Basis action 4219e201c85SYohann code << " // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n"; 4229e201c85SYohann switch (eval_mode) { 423920dcdc4Sjeremylt case CEED_EVAL_NONE: 4249e201c85SYohann if (!use_collograd_parallelization) { 4259e201c85SYohann code << " CeedScalar* r_t_"<<i<<" = r_u_"<<i<<";\n"; 426920dcdc4Sjeremylt } 427920dcdc4Sjeremylt break; 428920dcdc4Sjeremylt case CEED_EVAL_INTERP: 4299e201c85SYohann code << " CeedScalar r_t_"<<i<<"[num_comp_in_"<<i<<"*Q_1d];\n"; 4309e201c85SYohann code << " Interp"<<(dim>1?"Tensor":"")<<dim<<"d<num_comp_in_"<<i<<",P_in_"<<i<<",Q_1d>(data, r_u_"<<i<<", s_B_in_"<<i<<", r_t_"<<i<<");\n"; 431241a4b83SYohann break; 432241a4b83SYohann case CEED_EVAL_GRAD: 4339e201c85SYohann if (use_collograd_parallelization) { 4349e201c85SYohann code << " CeedScalar r_t_"<<i<<"[num_comp_in_"<<i<<"*Q_1d];\n"; 4359e201c85SYohann code << " Interp"<<(dim>1?"Tensor":"")<<dim<<"d<num_comp_in_"<<i<<",P_in_"<<i<<",Q_1d>(data, r_u_"<<i<<", s_B_in_"<<i<<", r_t_"<<i<<");\n"; 436ac421f39SYohann } else { 4379e201c85SYohann CeedInt P_1d; 4389e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr); 4399e201c85SYohann ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 4409e201c85SYohann code << " CeedScalar r_t_"<<i<<"[num_comp_in_"<<i<<"*dim*Q_1d];\n"; 4419e201c85SYohann code << " Grad"<<(dim>1?"Tensor":"")<<(dim==3&&Q_1d>=P_1d?"Collocated":"")<<dim<<"d<num_comp_in_"<<i<<",P_in_"<<i<<",Q_1d>(data, r_u_"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t_"<<i<<");\n"; 442ac421f39SYohann } 443241a4b83SYohann break; 444241a4b83SYohann case CEED_EVAL_WEIGHT: 4459e201c85SYohann code << " CeedScalar r_t_"<<i<<"[Q_1d];\n"; 4469e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_input_fields[i], &basis); CeedChkBackend(ierr); 447e15f9bd0SJeremy L Thompson ierr = CeedBasisGetData(basis, &basis_data); CeedChkBackend(ierr); 448437930d1SJeremy L Thompson data->W = basis_data->d_q_weight_1d; 4499e201c85SYohann code << " Weight"<<(dim>1?"Tensor":"")<<dim<<"d<Q_1d>(data, W, r_t_"<<i<<");\n"; 450241a4b83SYohann break; // No action 451241a4b83SYohann case CEED_EVAL_DIV: 452241a4b83SYohann break; // TODO: Not implemented 453241a4b83SYohann case CEED_EVAL_CURL: 454241a4b83SYohann break; // TODO: Not implemented 455241a4b83SYohann } 456241a4b83SYohann } 457ac421f39SYohann 4589e201c85SYohann // TODO: put in a function + separate colograd logic 459241a4b83SYohann // Q function 460818e0025SJeremy L Thompson code << "\n // -- Output field setup --\n"; 4619e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 462818e0025SJeremy L Thompson code << "\n // ---- Output field "<<i<<" ----\n"; 4639e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 464e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 4659e201c85SYohann if (eval_mode==CEED_EVAL_GRAD) 466241a4b83SYohann { 4679e201c85SYohann if (use_collograd_parallelization) { 468ac421f39SYohann //Accumulator for gradient slices 4699e201c85SYohann code << " CeedScalar r_tt_"<<i<<"[num_comp_out_"<<i<<"*Q_1d];\n"; 4709e201c85SYohann code << " for (CeedInt i = 0; i < num_comp_out_"<<i<<"; i++) {\n"; 4719e201c85SYohann code << " for (CeedInt j = 0; j < Q_1d; ++j) {\n"; 4729e201c85SYohann code << " r_tt_"<<i<<"[j + i*Q_1d] = 0.0;\n"; 473ac421f39SYohann code << " }\n"; 474ac421f39SYohann code << " }\n"; 475ac421f39SYohann } else { 4769e201c85SYohann code << " CeedScalar r_tt_"<<i<<"[num_comp_out_"<<i<<"*dim*Q_1d];\n"; 477241a4b83SYohann } 478ac421f39SYohann } 4799e201c85SYohann if (eval_mode==CEED_EVAL_NONE || eval_mode==CEED_EVAL_INTERP) 480241a4b83SYohann { 4819e201c85SYohann code << " CeedScalar r_tt_"<<i<<"[num_comp_out_"<<i<<"*Q_1d];\n"; 482241a4b83SYohann } 483241a4b83SYohann } 484ac421f39SYohann // We treat quadrature points per slice in 3d to save registers 4859e201c85SYohann if (use_collograd_parallelization) { 4869e201c85SYohann code << "\n // Note: Using planes of 3D elements\n"; 487ac421f39SYohann code << "#pragma unroll\n"; 4889e201c85SYohann code << " for (CeedInt q = 0; q < Q_1d; q++) {\n"; 489818e0025SJeremy L Thompson code << " // -- Input fields --\n"; 4909e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 491818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 4929e201c85SYohann // Get elem_size, eval_mode, num_comp 4939e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode); 494e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 495ac421f39SYohann // Basis action 4969e201c85SYohann code << " // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n"; 4979e201c85SYohann switch (eval_mode) { 498ac421f39SYohann case CEED_EVAL_NONE: 4999e201c85SYohann code << " CeedScalar r_q_"<<i<<"[num_comp_in_"<<i<<"];\n"; 5009a2291e3SJeremy L Thompson 5019e201c85SYohann bool is_strided; 5029e201c85SYohann ierr = CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict); CeedChkBackend(ierr); 5039e201c85SYohann ierr = CeedElemRestrictionIsStrided(Erestrict, &is_strided); CeedChkBackend(ierr); 5049e201c85SYohann if (!is_strided) { 5055c7b696cSjeremylt ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 506e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 5075c7b696cSjeremylt code << " const CeedInt lsize_in_"<<i<<" = "<<lsize<<";\n"; 5089e201c85SYohann CeedInt comp_stride; 5099e201c85SYohann ierr = CeedElemRestrictionGetCompStride(Erestrict, &comp_stride); CeedChkBackend(ierr); 5109e201c85SYohann code << " // CompStride: "<<comp_stride<<"\n"; 511e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 5129e201c85SYohann data->indices.inputs[i] = restr_data->d_ind; 5139e201c85SYohann code << " readSliceQuadsOffset"<<"3d<num_comp_in_"<<i<<", "<<comp_stride<<", Q_1d>(data, lsize_in_"<<i<<", elem, q, indices.inputs["<<i<<"], d_u_"<<i<<", r_q_"<<i<<");\n"; 514920dcdc4Sjeremylt } else { 5159e201c85SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elem_size); CeedChkBackend(ierr); 516920dcdc4Sjeremylt bool backendstrides; 517fd364f38SJeremy L Thompson ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &backendstrides); 518e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 5199e201c85SYohann CeedInt num_elem; 5209e201c85SYohann ierr = CeedElemRestrictionGetNumElements(Erestrict, &num_elem); 521e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 5229e201c85SYohann CeedInt strides[3] = {1, elem_size*num_elem, elem_size}; 523920dcdc4Sjeremylt if (!backendstrides) { 524920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 525e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 526920dcdc4Sjeremylt } 527920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 5289e201c85SYohann code << " readSliceQuadsStrided"<<"3d<num_comp_in_"<<i<<",Q_1d"","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, q, d_u_"<<i<<", r_q_"<<i<<");\n"; 529920dcdc4Sjeremylt } 530ac421f39SYohann break; 531ac421f39SYohann case CEED_EVAL_INTERP: 5329e201c85SYohann code << " CeedScalar r_q_"<<i<<"[num_comp_in_"<<i<<"];\n"; 5339e201c85SYohann code << " for (CeedInt j = 0; j < num_comp_in_"<<i<<" ; ++j) {\n"; 5349e201c85SYohann code << " r_q_"<<i<<"[j] = r_t_"<<i<<"[q + j*Q_1d];\n"; 535ac421f39SYohann code << " }\n"; 536ac421f39SYohann break; 537ac421f39SYohann case CEED_EVAL_GRAD: 5389e201c85SYohann code << " CeedScalar r_q_"<<i<<"[num_comp_in_"<<i<<"*dim];\n"; 5399e201c85SYohann code << " gradCollo3d<num_comp_in_"<<i<<",Q_1d>(data, q, r_t_"<<i<<", s_G_in_"<<i<<", r_q_"<<i<<");\n"; 540ac421f39SYohann break; 541ac421f39SYohann case CEED_EVAL_WEIGHT: 5429e201c85SYohann code << " CeedScalar r_q_"<<i<<"[1];\n"; 5439e201c85SYohann code << " r_q_"<<i<<"[0] = r_t_"<<i<<"[q];\n"; 544ac421f39SYohann break; // No action 545ac421f39SYohann case CEED_EVAL_DIV: 546ac421f39SYohann break; // TODO: Not implemented 547ac421f39SYohann case CEED_EVAL_CURL: 548ac421f39SYohann break; // TODO: Not implemented 549ac421f39SYohann } 550ac421f39SYohann } 551818e0025SJeremy L Thompson code << "\n // -- Output fields --\n"; 5529e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 553818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 5549e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 555e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 556ac421f39SYohann // Basis action 5579e201c85SYohann switch (eval_mode) { 558ac421f39SYohann case CEED_EVAL_NONE: 5599e201c85SYohann code << " CeedScalar r_qq_"<<i<<"[num_comp_out_"<<i<<"];\n"; 560ac421f39SYohann break; // No action 561ac421f39SYohann case CEED_EVAL_INTERP: 5629e201c85SYohann code << " CeedScalar r_qq_"<<i<<"[num_comp_out_"<<i<<"];\n"; 563ac421f39SYohann break; 564ac421f39SYohann case CEED_EVAL_GRAD: 5659e201c85SYohann code << " CeedScalar r_qq_"<<i<<"[num_comp_out_"<<i<<"*dim];\n"; 566ac421f39SYohann break; 567ac421f39SYohann case CEED_EVAL_WEIGHT: 568ac421f39SYohann break; // Should not occur 569ac421f39SYohann case CEED_EVAL_DIV: 570ac421f39SYohann break; // TODO: Not implemented 571ac421f39SYohann case CEED_EVAL_CURL: 572ac421f39SYohann break; // TODO: Not implemented 573ac421f39SYohann } 574ac421f39SYohann } 575ac421f39SYohann } else { 5769e201c85SYohann code << "\n // Note: Using full elements\n"; 577818e0025SJeremy L Thompson code << " // -- Input fields --\n"; 5789e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 579818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 5809e201c85SYohann code << " CeedScalar* r_q_"<<i<<" = r_t_"<<i<<";\n"; 581ac421f39SYohann } 582818e0025SJeremy L Thompson code << " // -- Output fields --\n"; 5839e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 584818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 5859e201c85SYohann code << " CeedScalar* r_qq_"<<i<<" = r_tt_"<<i<<";\n"; 586ac421f39SYohann } 587ac421f39SYohann } 588818e0025SJeremy L Thompson code << "\n // -- QFunction Inputs and outputs --\n"; 5899e201c85SYohann code << " CeedScalar* in["<<num_input_fields<<"];\n"; 5909e201c85SYohann for (CeedInt i = 0; i < num_input_fields; i++) { 591818e0025SJeremy L Thompson code << " // ---- Input field "<<i<<" ----\n"; 5929e201c85SYohann code << " in["<<i<<"] = r_q_"<<i<<";\n"; 5934d537eeaSYohann } 5949e201c85SYohann code << " CeedScalar* out["<<num_output_fields<<"];\n"; 5959e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 596818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 5979e201c85SYohann code << " out["<<i<<"] = r_qq_"<<i<<";\n"; 5984d537eeaSYohann } 599818e0025SJeremy L Thompson code << "\n // -- Apply QFunction --\n"; 6009e201c85SYohann code << " "<<q_function_name<<"(ctx, "; 6019e201c85SYohann if (dim != 3 || use_collograd_parallelization) { 602ac421f39SYohann code << "1"; 603ac421f39SYohann } else { 6049e201c85SYohann code << "Q_1d"; 605ac421f39SYohann } 606ac421f39SYohann code << ", in, out);\n"; 6079e201c85SYohann if (use_collograd_parallelization) { 608818e0025SJeremy L Thompson code << " // -- Output fields --\n"; 6099e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 610818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 6119e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 612e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 613ac421f39SYohann // Basis action 6149e201c85SYohann code << " // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n"; 6159e201c85SYohann switch (eval_mode) { 616ac421f39SYohann case CEED_EVAL_NONE: 6179e201c85SYohann code << " for (CeedInt j = 0; j < num_comp_out_"<<i<<" ; ++j) {\n"; 6189e201c85SYohann code << " r_tt_"<<i<<"[q + j*Q_1d] = r_qq_"<<i<<"[j];\n"; 619ac421f39SYohann code << " }\n"; 620ac421f39SYohann break; // No action 621ac421f39SYohann case CEED_EVAL_INTERP: 6229e201c85SYohann code << " for (CeedInt j = 0; j < num_comp_out_"<<i<<" ; ++j) {\n"; 6239e201c85SYohann code << " r_tt_"<<i<<"[q + j*Q_1d] = r_qq_"<<i<<"[j];\n"; 624ac421f39SYohann code << " }\n"; 625ac421f39SYohann break; 626ac421f39SYohann case CEED_EVAL_GRAD: 6279e201c85SYohann code << " gradColloTranspose3d<num_comp_out_"<<i<<",Q_1d>(data, q, r_qq_"<<i<<", s_G_out_"<<i<<", r_tt_"<<i<<");\n"; 628ac421f39SYohann break; 629ac421f39SYohann case CEED_EVAL_WEIGHT: 630ac421f39SYohann break; // Should not occur 631ac421f39SYohann case CEED_EVAL_DIV: 632ac421f39SYohann break; // TODO: Not implemented 633ac421f39SYohann case CEED_EVAL_CURL: 634ac421f39SYohann break; // TODO: Not implemented 635ac421f39SYohann } 636ac421f39SYohann } 637ac421f39SYohann code << " }\n"; 638ac421f39SYohann } 639241a4b83SYohann 640241a4b83SYohann // Output basis apply if needed 641ac421f39SYohann // Generate the correct eval mode code for each output 642818e0025SJeremy L Thompson code << "\n // -- Output field basis action and restrictions --\n"; 6439e201c85SYohann for (CeedInt i = 0; i < num_output_fields; i++) { 644818e0025SJeremy L Thompson code << " // ---- Output field "<<i<<" ----\n"; 6459e201c85SYohann // Get elem_size, eval_mode, num_comp 6469e201c85SYohann ierr = CeedOperatorFieldGetElemRestriction(op_output_fields[i], &Erestrict); 647e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 6489e201c85SYohann ierr = CeedElemRestrictionGetElementSize(Erestrict, &elem_size); 649e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 6509e201c85SYohann ierr = CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode); 651e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 6529e201c85SYohann ierr = CeedElemRestrictionGetNumComponents(Erestrict, &num_comp); 653e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 6549e201c85SYohann // TODO put in a function 655241a4b83SYohann // Basis action 6569e201c85SYohann code << " // EvalMode: "<<CeedEvalModes[eval_mode]<<"\n"; 6579e201c85SYohann switch (eval_mode) { 658241a4b83SYohann case CEED_EVAL_NONE: 6599e201c85SYohann code << " CeedScalar* r_v_"<<i<<" = r_tt_"<<i<<";\n"; 660241a4b83SYohann break; // No action 661241a4b83SYohann case CEED_EVAL_INTERP: 6629e201c85SYohann code << " CeedScalar r_v_"<<i<<"[num_comp_out_"<<i<<"*P_out_"<<i<<"];\n"; 6639e201c85SYohann code << " InterpTranspose"<<(dim>1?"Tensor":"")<<dim<<"d<num_comp_out_"<<i<<",P_out_"<<i<<",Q_1d>(data, r_tt_"<<i<<", s_B_out_"<<i<<", r_v_"<<i<<");\n"; 664241a4b83SYohann break; 665241a4b83SYohann case CEED_EVAL_GRAD: 6669e201c85SYohann code << " CeedScalar r_v_"<<i<<"[num_comp_out_"<<i<<"*P_out_"<<i<<"];\n"; 6679e201c85SYohann if (use_collograd_parallelization) { 6689e201c85SYohann code << " InterpTranspose"<<(dim>1?"Tensor":"")<<dim<<"d<num_comp_out_"<<i<<",P_out_"<<i<<",Q_1d>(data, r_tt_"<<i<<", s_B_out_"<<i<<", r_v_"<<i<<");\n"; 669ac421f39SYohann } else { 6709e201c85SYohann CeedInt P_1d; 6719e201c85SYohann ierr = CeedOperatorFieldGetBasis(op_output_fields[i], &basis); CeedChkBackend(ierr); 6729e201c85SYohann ierr = CeedBasisGetNumNodes1D(basis, &P_1d); CeedChkBackend(ierr); 6739e201c85SYohann code << " GradTranspose"<<(dim>1?"Tensor":"")<<(dim==3&&Q_1d>=P_1d?"Collocated":"")<<dim<<"d<num_comp_out_"<<i<<",P_out_"<<i<<",Q_1d>(data, r_tt_"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v_"<<i<<");\n"; 674ac421f39SYohann } 675241a4b83SYohann break; 676e9f4dca0SJeremy L Thompson // LCOV_EXCL_START 677241a4b83SYohann case CEED_EVAL_WEIGHT: { 678241a4b83SYohann Ceed ceed; 679e15f9bd0SJeremy L Thompson ierr = CeedOperatorGetCeed(op, &ceed); CeedChkBackend(ierr); 680e15f9bd0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 681241a4b83SYohann "CEED_EVAL_WEIGHT cannot be an output evaluation mode"); 682241a4b83SYohann break; // Should not occur 683241a4b83SYohann } 684241a4b83SYohann case CEED_EVAL_DIV: 685241a4b83SYohann break; // TODO: Not implemented 686241a4b83SYohann case CEED_EVAL_CURL: 687241a4b83SYohann break; // TODO: Not implemented 688e9f4dca0SJeremy L Thompson // LCOV_EXCL_STOP 689241a4b83SYohann } 6909e201c85SYohann // TODO put in a function 691920dcdc4Sjeremylt // Restriction 6929e201c85SYohann bool is_strided; 6939e201c85SYohann ierr = CeedElemRestrictionIsStrided(Erestrict, &is_strided); CeedChkBackend(ierr); 6949e201c85SYohann if (!is_strided) { 6955c7b696cSjeremylt ierr = CeedElemRestrictionGetLVectorSize(Erestrict, &lsize); 696e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 6975c7b696cSjeremylt code << " const CeedInt lsize_out_"<<i<<" = "<<lsize<<";\n"; 6989e201c85SYohann CeedInt comp_stride; 6999e201c85SYohann ierr = CeedElemRestrictionGetCompStride(Erestrict, &comp_stride); CeedChkBackend(ierr); 7009e201c85SYohann code << " // CompStride: "<<comp_stride<<"\n"; 701e15f9bd0SJeremy L Thompson ierr = CeedElemRestrictionGetData(Erestrict, &restr_data); CeedChkBackend(ierr); 7029e201c85SYohann data->indices.outputs[i] = restr_data->d_ind; 7039e201c85SYohann code << " writeDofsOffset"<<dim<<"d<num_comp_out_"<<i<<", "<<comp_stride<<", P_out_"<<i<<">(data, lsize_out_"<<i<<", elem, indices.outputs["<<i<<"], r_v_"<<i<<", d_v_"<<i<<");\n"; 704920dcdc4Sjeremylt } else { 7059e201c85SYohann bool has_backend_strides; 7069e201c85SYohann ierr = CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides); 707e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 7089e201c85SYohann CeedInt num_elem; 7099e201c85SYohann ierr = CeedElemRestrictionGetNumElements(Erestrict, &num_elem); 710e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 7119e201c85SYohann CeedInt strides[3] = {1, elem_size*num_elem, elem_size}; 7129e201c85SYohann if (!has_backend_strides) { 713920dcdc4Sjeremylt ierr = CeedElemRestrictionGetStrides(Erestrict, &strides); 714e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 715920dcdc4Sjeremylt } 716920dcdc4Sjeremylt code << " // Strides: {"<<strides[0]<<", "<<strides[1]<<", "<<strides[2]<<"}\n"; 7179e201c85SYohann code << " writeDofsStrided"<<dim<<"d<num_comp_out_"<<i<<",P_out_"<<i<<","<<strides[0]<<","<<strides[1]<<","<<strides[2]<<">(data, elem, r_v_"<<i<<", d_v_"<<i<<");\n"; 718920dcdc4Sjeremylt } 719241a4b83SYohann } 720241a4b83SYohann 721241a4b83SYohann code << " }\n"; 722818e0025SJeremy L Thompson code << "}\n"; 723818e0025SJeremy L Thompson code << "// -----------------------------------------------------------------------------\n\n"; 724241a4b83SYohann 725ab213215SJeremy L Thompson // View kernel for debugging 72646dc0734SJeremy L Thompson CeedDebug256(ceed, 2, "Generated Operator Kernels:\n"); 7273f21f6b1SJeremy L Thompson CeedDebug(ceed, code.str().c_str()); 728241a4b83SYohann 72918d499f1SYohann ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 1, 7309e201c85SYohann "T_1D", CeedIntMax(Q_1d, data->max_P_1d)); 731e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 7329e201c85SYohann ierr = CeedGetKernelCuda(ceed, data->module, operator_name.c_str(), &data->op); 733e15f9bd0SJeremy L Thompson CeedChkBackend(ierr); 734241a4b83SYohann 735e15f9bd0SJeremy L Thompson ierr = CeedOperatorSetSetupDone(op); CeedChkBackend(ierr); 736e15f9bd0SJeremy L Thompson return CEED_ERROR_SUCCESS; 737241a4b83SYohann } 738ab213215SJeremy L Thompson //------------------------------------------------------------------------------ 739