xref: /libCEED/rust/libceed-sys/c-src/backends/sycl-gen/ceed-sycl-gen-operator-build.sycl.cpp (revision 356036fa84f714fa73ef64c9a80ce2028dde816f)
16ca0f394SUmesh Unnikrishnan // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
26ca0f394SUmesh Unnikrishnan // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
36ca0f394SUmesh Unnikrishnan //
46ca0f394SUmesh Unnikrishnan // SPDX-License-Identifier: BSD-2-Clause
56ca0f394SUmesh Unnikrishnan //
66ca0f394SUmesh Unnikrishnan // This file is part of CEED:  http://github.com/ceed
76ca0f394SUmesh Unnikrishnan 
86ca0f394SUmesh Unnikrishnan #define CEED_DEBUG_COLOR 12
96ca0f394SUmesh Unnikrishnan 
106ca0f394SUmesh Unnikrishnan #include <ceed/backend.h>
116ca0f394SUmesh Unnikrishnan #include <ceed/ceed.h>
126ca0f394SUmesh Unnikrishnan #include <ceed/jit-source/sycl/sycl-types.h>
136ca0f394SUmesh Unnikrishnan #include <ceed/jit-tools.h>
146ca0f394SUmesh Unnikrishnan 
156ca0f394SUmesh Unnikrishnan #include <iostream>
166ca0f394SUmesh Unnikrishnan #include <sstream>
176ca0f394SUmesh Unnikrishnan #include <string>
186ca0f394SUmesh Unnikrishnan #include <string_view>
196ca0f394SUmesh Unnikrishnan #include <vector>
206ca0f394SUmesh Unnikrishnan 
216ca0f394SUmesh Unnikrishnan #include "../sycl-ref/ceed-sycl-ref.hpp"
226ca0f394SUmesh Unnikrishnan #include "../sycl-shared/ceed-sycl-shared.hpp"
236ca0f394SUmesh Unnikrishnan #include "../sycl/ceed-sycl-compile.hpp"
246ca0f394SUmesh Unnikrishnan 
256ca0f394SUmesh Unnikrishnan #include "ceed-sycl-gen.hpp"
266ca0f394SUmesh Unnikrishnan 
276ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
286ca0f394SUmesh Unnikrishnan // Calculate the block size used for launching the operator kernel
296ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
306ca0f394SUmesh Unnikrishnan extern "C" int BlockGridCalculate_Sycl_gen(const CeedInt dim, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes) {
316ca0f394SUmesh Unnikrishnan   const CeedInt thread1d = CeedIntMax(Q_1d, P_1d);
326ca0f394SUmesh Unnikrishnan   if (dim == 1) {
336ca0f394SUmesh Unnikrishnan     CeedInt elems_per_block = 64 * thread1d > 256 ? 256 / thread1d : 64;
346ca0f394SUmesh Unnikrishnan     elems_per_block         = elems_per_block > 0 ? elems_per_block : 1;
356ca0f394SUmesh Unnikrishnan     block_sizes[0]          = thread1d;
366ca0f394SUmesh Unnikrishnan     block_sizes[1]          = 1;
376ca0f394SUmesh Unnikrishnan     block_sizes[2]          = elems_per_block;
386ca0f394SUmesh Unnikrishnan   } else if (dim == 2) {
396ca0f394SUmesh Unnikrishnan     const CeedInt elems_per_block = thread1d < 4 ? 16 : 2;
406ca0f394SUmesh Unnikrishnan     block_sizes[0]                = thread1d;
416ca0f394SUmesh Unnikrishnan     block_sizes[1]                = thread1d;
426ca0f394SUmesh Unnikrishnan     block_sizes[2]                = elems_per_block;
436ca0f394SUmesh Unnikrishnan   } else if (dim == 3) {
446ca0f394SUmesh Unnikrishnan     const CeedInt elems_per_block = thread1d < 6 ? 4 : (thread1d < 8 ? 2 : 1);
456ca0f394SUmesh Unnikrishnan     block_sizes[0]                = thread1d;
466ca0f394SUmesh Unnikrishnan     block_sizes[1]                = thread1d;
476ca0f394SUmesh Unnikrishnan     block_sizes[2]                = elems_per_block;
486ca0f394SUmesh Unnikrishnan   }
496ca0f394SUmesh Unnikrishnan   return CEED_ERROR_SUCCESS;
506ca0f394SUmesh Unnikrishnan }
516ca0f394SUmesh Unnikrishnan 
526ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
536ca0f394SUmesh Unnikrishnan // Build single operator kernel
546ca0f394SUmesh Unnikrishnan // - [ ] Check arguments to device functions reudsed from sycl-shared-basis are correct
556ca0f394SUmesh Unnikrishnan // - [ ] Do kernel jitting!
566ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
576ca0f394SUmesh Unnikrishnan extern "C" int CeedOperatorBuildKernel_Sycl_gen(CeedOperator op) {
586ca0f394SUmesh Unnikrishnan   bool is_setup_done;
596ca0f394SUmesh Unnikrishnan   CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
606ca0f394SUmesh Unnikrishnan   if (is_setup_done) return CEED_ERROR_SUCCESS;
616ca0f394SUmesh Unnikrishnan 
626ca0f394SUmesh Unnikrishnan   Ceed ceed;
636ca0f394SUmesh Unnikrishnan   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
646ca0f394SUmesh Unnikrishnan   Ceed_Sycl *sycl_data;
656ca0f394SUmesh Unnikrishnan   CeedCallBackend(CeedGetData(ceed, &sycl_data));
666ca0f394SUmesh Unnikrishnan 
676ca0f394SUmesh Unnikrishnan   CeedOperator_Sycl_gen *impl;
686ca0f394SUmesh Unnikrishnan   CeedCallBackend(CeedOperatorGetData(op, &impl));
696ca0f394SUmesh Unnikrishnan   Fields_Sycl             h_B, h_G;
706ca0f394SUmesh Unnikrishnan   FieldsInt_Sycl          h_indices;
716ca0f394SUmesh Unnikrishnan   CeedQFunction           qf;
726ca0f394SUmesh Unnikrishnan   CeedQFunction_Sycl_gen *qf_impl;
736ca0f394SUmesh Unnikrishnan   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
746ca0f394SUmesh Unnikrishnan   CeedCallBackend(CeedQFunctionGetData(qf, &qf_impl));
756ca0f394SUmesh Unnikrishnan   CeedSize lsize;
766ca0f394SUmesh Unnikrishnan   CeedInt  Q, P_1d = 0, Q_1d = 0, elem_size, num_input_fields, num_output_fields, num_comp, dim = 1;
776ca0f394SUmesh Unnikrishnan   CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
786ca0f394SUmesh Unnikrishnan   Q_1d = Q;
796ca0f394SUmesh Unnikrishnan 
806ca0f394SUmesh Unnikrishnan   CeedOperatorField *op_input_fields, *op_output_fields;
816ca0f394SUmesh Unnikrishnan   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
826ca0f394SUmesh Unnikrishnan   CeedQFunctionField *qf_input_fields, *qf_output_fields;
836ca0f394SUmesh Unnikrishnan   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
846ca0f394SUmesh Unnikrishnan 
856ca0f394SUmesh Unnikrishnan   CeedEvalMode              eval_mode;
866ca0f394SUmesh Unnikrishnan   CeedBasis                 basis;
876ca0f394SUmesh Unnikrishnan   CeedBasis_Sycl_shared    *basis_impl;
886ca0f394SUmesh Unnikrishnan   CeedElemRestriction       Erestrict;
896ca0f394SUmesh Unnikrishnan   CeedElemRestriction_Sycl *restr_impl;
906ca0f394SUmesh Unnikrishnan 
916ca0f394SUmesh Unnikrishnan   // Check for restriction only identity operator
926ca0f394SUmesh Unnikrishnan   bool is_identity_qf;
936ca0f394SUmesh Unnikrishnan   CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf));
946ca0f394SUmesh Unnikrishnan   if (is_identity_qf) {
956ca0f394SUmesh Unnikrishnan     CeedEvalMode eval_mode_in, eval_mode_out;
966ca0f394SUmesh Unnikrishnan     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in));
976ca0f394SUmesh Unnikrishnan     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out));
986ca0f394SUmesh Unnikrishnan     if (eval_mode_in == CEED_EVAL_NONE && eval_mode_out == CEED_EVAL_NONE) {
996ca0f394SUmesh Unnikrishnan       // LCOV_EXCL_START
1006ca0f394SUmesh Unnikrishnan       return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement restriction only identity operators");
1016ca0f394SUmesh Unnikrishnan       // LCOV_EXCL_STOP
1026ca0f394SUmesh Unnikrishnan     }
1036ca0f394SUmesh Unnikrishnan   }
1046ca0f394SUmesh Unnikrishnan 
1056ca0f394SUmesh Unnikrishnan   std::ostringstream code;
1066ca0f394SUmesh Unnikrishnan   // TODO: generalize to accept different device functions?
1076ca0f394SUmesh Unnikrishnan   {
1086ca0f394SUmesh Unnikrishnan     char *tensor_basis_kernel_path, *tensor_basis_code;
1096ca0f394SUmesh Unnikrishnan     CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/sycl/sycl-shared-basis-tensor-templates.h", &tensor_basis_kernel_path));
1106ca0f394SUmesh Unnikrishnan     CeedDebug256(ceed, 2, "----- Loading Tensor Basis Kernel Source -----\n");
1116ca0f394SUmesh Unnikrishnan     CeedCallBackend(CeedLoadSourceToBuffer(ceed, tensor_basis_kernel_path, &tensor_basis_code));
1126ca0f394SUmesh Unnikrishnan     code << tensor_basis_code;
1136ca0f394SUmesh Unnikrishnan     CeedCallBackend(CeedFree(&tensor_basis_kernel_path));
1146ca0f394SUmesh Unnikrishnan     CeedCallBackend(CeedFree(&tensor_basis_code));
1156ca0f394SUmesh Unnikrishnan   }
1166ca0f394SUmesh Unnikrishnan   {
1176ca0f394SUmesh Unnikrishnan     char *sycl_gen_template_path, *sycl_gen_template_source;
1186ca0f394SUmesh Unnikrishnan     CeedCallBackend(CeedGetJitAbsolutePath(ceed, "ceed/jit-source/sycl/sycl-gen-templates.h", &sycl_gen_template_path));
1196ca0f394SUmesh Unnikrishnan     CeedDebug256(ceed, 2, "----- Loading Sycl-Gen Template Source -----\n");
1206ca0f394SUmesh Unnikrishnan     CeedCallBackend(CeedLoadSourceToBuffer(ceed, sycl_gen_template_path, &sycl_gen_template_source));
1216ca0f394SUmesh Unnikrishnan     code << sycl_gen_template_source;
1226ca0f394SUmesh Unnikrishnan     CeedCallBackend(CeedFree(&sycl_gen_template_path));
1236ca0f394SUmesh Unnikrishnan     CeedCallBackend(CeedFree(&sycl_gen_template_source));
1246ca0f394SUmesh Unnikrishnan   }
1256ca0f394SUmesh Unnikrishnan 
1266ca0f394SUmesh Unnikrishnan   std::string_view  q_function_source(qf_impl->q_function_source);
1276ca0f394SUmesh Unnikrishnan   std::string_view  q_function_name(qf_impl->q_function_name);
1286ca0f394SUmesh Unnikrishnan   const std::string operator_name = "CeedKernelSyclGenOperator_" + std::string(q_function_name);
1296ca0f394SUmesh Unnikrishnan 
1306ca0f394SUmesh Unnikrishnan   // Find dim, P_1d, Q_1d
1316ca0f394SUmesh Unnikrishnan   impl->max_P_1d = 0;
1326ca0f394SUmesh Unnikrishnan   for (CeedInt i = 0; i < num_input_fields; i++) {
1336ca0f394SUmesh Unnikrishnan     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
134*356036faSJeremy L Thompson     if (basis != CEED_BASIS_NONE) {
1356ca0f394SUmesh Unnikrishnan       CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
1366ca0f394SUmesh Unnikrishnan       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1376ca0f394SUmesh Unnikrishnan 
1386ca0f394SUmesh Unnikrishnan       // Collect dim, P_1d, and Q_1d
1396ca0f394SUmesh Unnikrishnan       CeedCallBackend(CeedBasisGetDimension(basis, &dim));
1406ca0f394SUmesh Unnikrishnan       bool isTensor;
1416ca0f394SUmesh Unnikrishnan       CeedCallBackend(CeedBasisIsTensor(basis, &isTensor));
1426ca0f394SUmesh Unnikrishnan       if (isTensor) {
1436ca0f394SUmesh Unnikrishnan         CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
1446ca0f394SUmesh Unnikrishnan         CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
1456ca0f394SUmesh Unnikrishnan         if (P_1d > impl->max_P_1d) impl->max_P_1d = P_1d;
1466ca0f394SUmesh Unnikrishnan       } else {
1476ca0f394SUmesh Unnikrishnan         // LCOV_EXCL_START
1486ca0f394SUmesh Unnikrishnan         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
1496ca0f394SUmesh Unnikrishnan         // LCOV_EXCL_STOP
1506ca0f394SUmesh Unnikrishnan       }
1516ca0f394SUmesh Unnikrishnan     }
1526ca0f394SUmesh Unnikrishnan   }
1536ca0f394SUmesh Unnikrishnan   // Check output bases for Q_1d, dim as well
154*356036faSJeremy L Thompson   //   The only input basis might be CEED_BASIS_NONE
1556ca0f394SUmesh Unnikrishnan   for (CeedInt i = 0; i < num_output_fields; i++) {
1566ca0f394SUmesh Unnikrishnan     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
1576ca0f394SUmesh Unnikrishnan 
158*356036faSJeremy L Thompson     if (basis != CEED_BASIS_NONE) {
1596ca0f394SUmesh Unnikrishnan       CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
1606ca0f394SUmesh Unnikrishnan       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
1616ca0f394SUmesh Unnikrishnan 
1626ca0f394SUmesh Unnikrishnan       // Collect Q_1d
1636ca0f394SUmesh Unnikrishnan       CeedCallBackend(CeedBasisGetDimension(basis, &dim));
1646ca0f394SUmesh Unnikrishnan       bool isTensor;
1656ca0f394SUmesh Unnikrishnan       CeedCallBackend(CeedBasisIsTensor(basis, &isTensor));
1666ca0f394SUmesh Unnikrishnan       if (isTensor) {
1676ca0f394SUmesh Unnikrishnan         CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d));
1686ca0f394SUmesh Unnikrishnan       } else {
1696ca0f394SUmesh Unnikrishnan         // LCOV_EXCL_START
1706ca0f394SUmesh Unnikrishnan         return CeedError(ceed, CEED_ERROR_BACKEND, "Backend does not implement operators with non-tensor basis");
1716ca0f394SUmesh Unnikrishnan         // LCOV_EXCL_STOP
1726ca0f394SUmesh Unnikrishnan       }
1736ca0f394SUmesh Unnikrishnan     }
1746ca0f394SUmesh Unnikrishnan   }
1756ca0f394SUmesh Unnikrishnan   impl->dim  = dim;
1766ca0f394SUmesh Unnikrishnan   impl->Q_1d = Q_1d;
1776ca0f394SUmesh Unnikrishnan 
1786ca0f394SUmesh Unnikrishnan   // Only use 3D collocated gradient parallelization strategy when gradient is computed
1796ca0f394SUmesh Unnikrishnan   // TODO: put in a function?
1806ca0f394SUmesh Unnikrishnan   bool use_collograd_parallelization = false;
1816ca0f394SUmesh Unnikrishnan   if (dim == 3) {
1826ca0f394SUmesh Unnikrishnan     bool was_grad_found = false;
1836ca0f394SUmesh Unnikrishnan     for (CeedInt i = 0; i < num_input_fields; i++) {
1846ca0f394SUmesh Unnikrishnan       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1856ca0f394SUmesh Unnikrishnan       if (eval_mode == CEED_EVAL_GRAD) {
1866ca0f394SUmesh Unnikrishnan         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
1876ca0f394SUmesh Unnikrishnan         CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
1881c66c397SJeremy L Thompson         use_collograd_parallelization = basis_impl->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true);
1896ca0f394SUmesh Unnikrishnan         was_grad_found                = true;
1906ca0f394SUmesh Unnikrishnan       }
1916ca0f394SUmesh Unnikrishnan     }
1926ca0f394SUmesh Unnikrishnan     for (CeedInt i = 0; i < num_output_fields; i++) {
1936ca0f394SUmesh Unnikrishnan       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
1946ca0f394SUmesh Unnikrishnan       if (eval_mode == CEED_EVAL_GRAD) {
1956ca0f394SUmesh Unnikrishnan         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
1966ca0f394SUmesh Unnikrishnan         CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
1971c66c397SJeremy L Thompson         use_collograd_parallelization = basis_impl->d_collo_grad_1d && (was_grad_found ? use_collograd_parallelization : true);
1986ca0f394SUmesh Unnikrishnan         was_grad_found                = true;
1996ca0f394SUmesh Unnikrishnan       }
2006ca0f394SUmesh Unnikrishnan     }
2016ca0f394SUmesh Unnikrishnan   }
2026ca0f394SUmesh Unnikrishnan 
2036ca0f394SUmesh Unnikrishnan   CeedInt block_sizes[3];
2046ca0f394SUmesh Unnikrishnan   CeedCallBackend(BlockGridCalculate_Sycl_gen(dim, P_1d, Q_1d, block_sizes));
2056ca0f394SUmesh Unnikrishnan 
2066ca0f394SUmesh Unnikrishnan   // Define CEED_Q_VLA
2076ca0f394SUmesh Unnikrishnan   code << "\n#undef CEED_Q_VLA\n";
2086ca0f394SUmesh Unnikrishnan   if (dim != 3 || use_collograd_parallelization) {
2096ca0f394SUmesh Unnikrishnan     code << "#define CEED_Q_VLA 1\n\n";
2106ca0f394SUmesh Unnikrishnan   } else {
2116ca0f394SUmesh Unnikrishnan     code << "#define CEED_Q_VLA " << Q_1d << "\n\n";
2126ca0f394SUmesh Unnikrishnan   }
2136ca0f394SUmesh Unnikrishnan 
2146ca0f394SUmesh Unnikrishnan   // Determine subgroup size based on supported sizes : Default : 16 (if supported)
2156ca0f394SUmesh Unnikrishnan   std::vector allowed_sg_sizes  = sycl_data->sycl_device.get_info<sycl::info::device::sub_group_sizes>();
2166ca0f394SUmesh Unnikrishnan   CeedInt     sub_group_size_op = allowed_sg_sizes[allowed_sg_sizes.size() - 1];
2176ca0f394SUmesh Unnikrishnan   for (const auto &s : allowed_sg_sizes) {
2186ca0f394SUmesh Unnikrishnan     if (s == 16) {
2196ca0f394SUmesh Unnikrishnan       sub_group_size_op = s;
2206ca0f394SUmesh Unnikrishnan       break;
2216ca0f394SUmesh Unnikrishnan     }
2226ca0f394SUmesh Unnikrishnan   }
2236ca0f394SUmesh Unnikrishnan 
2246ca0f394SUmesh Unnikrishnan   code << q_function_source;
2256ca0f394SUmesh Unnikrishnan 
2266ca0f394SUmesh Unnikrishnan   // Kernel function
2276ca0f394SUmesh Unnikrishnan   code << "\n// -----------------------------------------------------------------------------\n";
2286ca0f394SUmesh Unnikrishnan   code << "__attribute__((reqd_work_group_size(GROUP_SIZE_X, GROUP_SIZE_Y, GROUP_SIZE_Z), intel_reqd_sub_group_size(" << sub_group_size_op << ")))\n";
2296ca0f394SUmesh Unnikrishnan   code << "kernel void " << operator_name << "(";
2306ca0f394SUmesh Unnikrishnan   code << "const CeedInt num_elem, ";
2316ca0f394SUmesh Unnikrishnan   code << "global void* ctx, ";
2326ca0f394SUmesh Unnikrishnan   code << "global const FieldsInt_Sycl* indices, ";
2336ca0f394SUmesh Unnikrishnan   code << "global Fields_Sycl* fields, ";
2346ca0f394SUmesh Unnikrishnan   code << "global const Fields_Sycl* B, ";
2356ca0f394SUmesh Unnikrishnan   code << "global const Fields_Sycl* G, ";
2366ca0f394SUmesh Unnikrishnan   code << "global const CeedScalar * restrict W";
2376ca0f394SUmesh Unnikrishnan   code << ") {\n";
2386ca0f394SUmesh Unnikrishnan 
2396ca0f394SUmesh Unnikrishnan   for (CeedInt i = 0; i < num_input_fields; i++) {
2406ca0f394SUmesh Unnikrishnan     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
2416ca0f394SUmesh Unnikrishnan     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
2426ca0f394SUmesh Unnikrishnan       code << "  global const CeedScalar* d_u_" << i << " = fields->inputs[" << i << "];\n";
2436ca0f394SUmesh Unnikrishnan     }
2446ca0f394SUmesh Unnikrishnan   }
2456ca0f394SUmesh Unnikrishnan 
2466ca0f394SUmesh Unnikrishnan   for (CeedInt i = 0; i < num_output_fields; i++) {
2476ca0f394SUmesh Unnikrishnan     code << "  global CeedScalar* d_v_" << i << " = fields->outputs[" << i << "];\n";
2486ca0f394SUmesh Unnikrishnan   }
2496ca0f394SUmesh Unnikrishnan 
2506ca0f394SUmesh Unnikrishnan   // TODO: Convert these to defined constants to save on GRF
2516ca0f394SUmesh Unnikrishnan   code << "  const CeedInt DIM = " << dim << ";\n";
2526ca0f394SUmesh Unnikrishnan   code << "  const CeedInt Q_1D = " << Q_1d << ";\n";
2536ca0f394SUmesh Unnikrishnan 
2546ca0f394SUmesh Unnikrishnan   const CeedInt scratch_size = block_sizes[0] * block_sizes[1] * block_sizes[2];
2556ca0f394SUmesh Unnikrishnan   code << "  local CeedScalar scratch[" << scratch_size << "];\n";
2566ca0f394SUmesh Unnikrishnan   code << "  local CeedScalar * elem_scratch = scratch + get_local_id(2) * T_1D" << (dim > 1 ? "*T_1D" : "") << ";\n";
2576ca0f394SUmesh Unnikrishnan 
2586ca0f394SUmesh Unnikrishnan   code << "\n  // -- Input field constants and basis data --\n";
2596ca0f394SUmesh Unnikrishnan   // Initialize constants, and matrices B and G
2606ca0f394SUmesh Unnikrishnan   for (CeedInt i = 0; i < num_input_fields; i++) {
2616ca0f394SUmesh Unnikrishnan     code << "  // ---- Input field " << i << " ----\n";
2626ca0f394SUmesh Unnikrishnan     // Get elem_size, eval_mode, num_comp
2636ca0f394SUmesh Unnikrishnan     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict));
2646ca0f394SUmesh Unnikrishnan     CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size));
2656ca0f394SUmesh Unnikrishnan     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
2666ca0f394SUmesh Unnikrishnan     CeedCallBackend(CeedElemRestrictionGetNumComponents(Erestrict, &num_comp));
2676ca0f394SUmesh Unnikrishnan 
2686ca0f394SUmesh Unnikrishnan     // Set field constants
2696ca0f394SUmesh Unnikrishnan     if (eval_mode != CEED_EVAL_WEIGHT) {
2706ca0f394SUmesh Unnikrishnan       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
271*356036faSJeremy L Thompson       if (basis != CEED_BASIS_NONE) {
2726ca0f394SUmesh Unnikrishnan         CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
2736ca0f394SUmesh Unnikrishnan         code << "  const CeedInt P_in_" << i << " = " << P_1d << ";\n";
2746ca0f394SUmesh Unnikrishnan       } else {
2756ca0f394SUmesh Unnikrishnan         code << "  const CeedInt P_in_" << i << " = " << Q_1d << ";\n";
2766ca0f394SUmesh Unnikrishnan       }
2776ca0f394SUmesh Unnikrishnan       code << "  const CeedInt num_comp_in_" << i << " = " << num_comp << ";\n";
2786ca0f394SUmesh Unnikrishnan     }
2796ca0f394SUmesh Unnikrishnan 
2806ca0f394SUmesh Unnikrishnan     // Load basis data
2816ca0f394SUmesh Unnikrishnan     code << "  // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
2826ca0f394SUmesh Unnikrishnan     switch (eval_mode) {
2836ca0f394SUmesh Unnikrishnan       case CEED_EVAL_NONE:
2846ca0f394SUmesh Unnikrishnan         break;
2856ca0f394SUmesh Unnikrishnan       case CEED_EVAL_INTERP:
2866ca0f394SUmesh Unnikrishnan         CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
2876ca0f394SUmesh Unnikrishnan         h_B.inputs[i] = basis_impl->d_interp_1d;
2886ca0f394SUmesh Unnikrishnan         code << "  local CeedScalar s_B_in_" << i << "[" << P_1d * Q_1d << "];\n";
2896ca0f394SUmesh Unnikrishnan         code << "  loadMatrix(P_in_" << i << "*Q_1D, B->inputs[" << i << "], s_B_in_" << i << ");\n";
2906ca0f394SUmesh Unnikrishnan         break;
2916ca0f394SUmesh Unnikrishnan       case CEED_EVAL_GRAD:
2926ca0f394SUmesh Unnikrishnan         CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
2936ca0f394SUmesh Unnikrishnan         h_B.inputs[i] = basis_impl->d_interp_1d;
2946ca0f394SUmesh Unnikrishnan         code << "  local CeedScalar s_B_in_" << i << "[" << P_1d * Q_1d << "];\n";
2956ca0f394SUmesh Unnikrishnan         code << "  loadMatrix(P_in_" << i << "*Q_1D, B->inputs[" << i << "], s_B_in_" << i << ");\n";
2966ca0f394SUmesh Unnikrishnan         if (use_collograd_parallelization) {
2976ca0f394SUmesh Unnikrishnan           h_G.inputs[i] = basis_impl->d_collo_grad_1d;
2986ca0f394SUmesh Unnikrishnan           code << "  local CeedScalar s_G_in_" << i << "[" << Q_1d * Q_1d << "];\n";
2996ca0f394SUmesh Unnikrishnan           code << "  loadMatrix(Q_1D*Q_1D, G->inputs[" << i << "], s_G_in_" << i << ");\n";
3006ca0f394SUmesh Unnikrishnan         } else {
3011c66c397SJeremy L Thompson           bool has_collo_grad = basis_impl->d_collo_grad_1d;
3026ca0f394SUmesh Unnikrishnan           h_G.inputs[i]       = has_collo_grad ? basis_impl->d_collo_grad_1d : basis_impl->d_grad_1d;
3036ca0f394SUmesh Unnikrishnan           code << "  local CeedScalar s_G_in_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n";
3046ca0f394SUmesh Unnikrishnan           code << "  loadMatrix(" << (has_collo_grad ? "Q_1D" : ("P_in_" + std::to_string(i))) << "*Q_1D, G->inputs[" << i << "], s_G_in_" << i
3056ca0f394SUmesh Unnikrishnan                << ");\n";
3066ca0f394SUmesh Unnikrishnan         }
3076ca0f394SUmesh Unnikrishnan         break;
3086ca0f394SUmesh Unnikrishnan       case CEED_EVAL_WEIGHT:
3096ca0f394SUmesh Unnikrishnan         break;  // No action
3106ca0f394SUmesh Unnikrishnan       case CEED_EVAL_DIV:
3116ca0f394SUmesh Unnikrishnan         break;  // TODO: Not implemented
3126ca0f394SUmesh Unnikrishnan       case CEED_EVAL_CURL:
3136ca0f394SUmesh Unnikrishnan         break;  // TODO: Not implemented
3146ca0f394SUmesh Unnikrishnan     }
3156ca0f394SUmesh Unnikrishnan   }
3166ca0f394SUmesh Unnikrishnan 
3176ca0f394SUmesh Unnikrishnan   code << "\n  // -- Output field constants and basis data --\n";
3186ca0f394SUmesh Unnikrishnan   for (CeedInt i = 0; i < num_output_fields; i++) {
3196ca0f394SUmesh Unnikrishnan     code << "  // ---- Output field " << i << " ----\n";
3206ca0f394SUmesh Unnikrishnan     // Get elem_size, eval_mode, num_comp
3216ca0f394SUmesh Unnikrishnan     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &Erestrict));
3226ca0f394SUmesh Unnikrishnan     CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size));
3236ca0f394SUmesh Unnikrishnan     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
3246ca0f394SUmesh Unnikrishnan     CeedCallBackend(CeedElemRestrictionGetNumComponents(Erestrict, &num_comp));
3256ca0f394SUmesh Unnikrishnan 
3266ca0f394SUmesh Unnikrishnan     // Set field constants
3276ca0f394SUmesh Unnikrishnan     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
328*356036faSJeremy L Thompson     if (basis != CEED_BASIS_NONE) {
3296ca0f394SUmesh Unnikrishnan       CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
3306ca0f394SUmesh Unnikrishnan       code << "  const CeedInt P_out_" << i << " = " << P_1d << ";\n";
3316ca0f394SUmesh Unnikrishnan     } else {
3326ca0f394SUmesh Unnikrishnan       code << "  const CeedInt P_out_" << i << " = " << Q_1d << ";\n";
3336ca0f394SUmesh Unnikrishnan     }
3346ca0f394SUmesh Unnikrishnan     code << "  const CeedInt num_comp_out_" << i << " = " << num_comp << ";\n";
3356ca0f394SUmesh Unnikrishnan 
3366ca0f394SUmesh Unnikrishnan     // Load basis data
3376ca0f394SUmesh Unnikrishnan     code << "  // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
3386ca0f394SUmesh Unnikrishnan     switch (eval_mode) {
3396ca0f394SUmesh Unnikrishnan       case CEED_EVAL_NONE:
3406ca0f394SUmesh Unnikrishnan         break;  // No action
3416ca0f394SUmesh Unnikrishnan       case CEED_EVAL_INTERP:
3426ca0f394SUmesh Unnikrishnan         CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
3436ca0f394SUmesh Unnikrishnan         h_B.outputs[i] = basis_impl->d_interp_1d;
3446ca0f394SUmesh Unnikrishnan         code << "  local CeedScalar s_B_out_" << i << "[" << P_1d * Q_1d << "];\n";
3456ca0f394SUmesh Unnikrishnan         code << "  loadMatrix(P_out_" << i << "*Q_1D, B->outputs[" << i << "], s_B_out_" << i << ");\n";
3466ca0f394SUmesh Unnikrishnan         break;
3476ca0f394SUmesh Unnikrishnan       case CEED_EVAL_GRAD:
3486ca0f394SUmesh Unnikrishnan         CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
3496ca0f394SUmesh Unnikrishnan         h_B.outputs[i] = basis_impl->d_interp_1d;
3506ca0f394SUmesh Unnikrishnan         code << "  local CeedScalar s_B_out_" << i << "[" << P_1d * Q_1d << "];\n";
3516ca0f394SUmesh Unnikrishnan         code << "  loadMatrix(P_out_" << i << "*Q_1D, B->outputs[" << i << "], s_B_out_" << i << ");\n";
3526ca0f394SUmesh Unnikrishnan         if (use_collograd_parallelization) {
3536ca0f394SUmesh Unnikrishnan           h_G.outputs[i] = basis_impl->d_collo_grad_1d;
3546ca0f394SUmesh Unnikrishnan           code << "  local CeedScalar s_G_out_" << i << "[" << Q_1d * Q_1d << "];\n";
3556ca0f394SUmesh Unnikrishnan           code << "  loadMatrix(Q_1D*Q_1D, G->outputs[" << i << "], s_G_out_" << i << ");\n";
3566ca0f394SUmesh Unnikrishnan         } else {
3571c66c397SJeremy L Thompson           bool has_collo_grad = basis_impl->d_collo_grad_1d;
3586ca0f394SUmesh Unnikrishnan           h_G.outputs[i]      = has_collo_grad ? basis_impl->d_collo_grad_1d : basis_impl->d_grad_1d;
3596ca0f394SUmesh Unnikrishnan           code << "  local CeedScalar s_G_out_" << i << "[" << Q_1d * (has_collo_grad ? Q_1d : P_1d) << "];\n";
3606ca0f394SUmesh Unnikrishnan           code << "  loadMatrix(" << (has_collo_grad ? "Q_1D" : ("P_out_" + std::to_string(i))) << "*Q_1D, G->outputs[" << i << "], s_G_out_" << i
3616ca0f394SUmesh Unnikrishnan                << ");\n";
3626ca0f394SUmesh Unnikrishnan         }
3636ca0f394SUmesh Unnikrishnan         break;
3646ca0f394SUmesh Unnikrishnan       // LCOV_EXCL_START
3656ca0f394SUmesh Unnikrishnan       case CEED_EVAL_WEIGHT: {
3666ca0f394SUmesh Unnikrishnan         Ceed ceed;
3676ca0f394SUmesh Unnikrishnan         CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
3686ca0f394SUmesh Unnikrishnan         return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
3696ca0f394SUmesh Unnikrishnan         break;  // Should not occur
3706ca0f394SUmesh Unnikrishnan       }
3716ca0f394SUmesh Unnikrishnan       case CEED_EVAL_DIV:
3726ca0f394SUmesh Unnikrishnan         break;  // TODO: Not implemented
3736ca0f394SUmesh Unnikrishnan       case CEED_EVAL_CURL:
3746ca0f394SUmesh Unnikrishnan         break;  // TODO: Not implemented
3756ca0f394SUmesh Unnikrishnan                 // LCOV_EXCL_STOP
3766ca0f394SUmesh Unnikrishnan     }
3776ca0f394SUmesh Unnikrishnan   }
3786ca0f394SUmesh Unnikrishnan   code << "\n  // -- Element loop --\n";
3796ca0f394SUmesh Unnikrishnan   code << "  work_group_barrier(CLK_LOCAL_MEM_FENCE);\n";
3806ca0f394SUmesh Unnikrishnan   code << "  {\n";
3816ca0f394SUmesh Unnikrishnan   // Input basis apply if needed
3826ca0f394SUmesh Unnikrishnan   // Generate the correct eval mode code for each input
3836ca0f394SUmesh Unnikrishnan   code << "    // -- Input field restrictions and basis actions --\n";
3846ca0f394SUmesh Unnikrishnan   for (CeedInt i = 0; i < num_input_fields; i++) {
3856ca0f394SUmesh Unnikrishnan     code << "    // ---- Input field " << i << " ----\n";
3866ca0f394SUmesh Unnikrishnan     // Get elem_size, eval_mode, num_comp
3876ca0f394SUmesh Unnikrishnan     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict));
3886ca0f394SUmesh Unnikrishnan     CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size));
3896ca0f394SUmesh Unnikrishnan     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
3906ca0f394SUmesh Unnikrishnan     CeedCallBackend(CeedElemRestrictionGetNumComponents(Erestrict, &num_comp));
3916ca0f394SUmesh Unnikrishnan 
3926ca0f394SUmesh Unnikrishnan     // Restriction
3936ca0f394SUmesh Unnikrishnan     if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_collograd_parallelization)) {
3946ca0f394SUmesh Unnikrishnan       code << "    CeedScalar r_u_" << i << "[num_comp_in_" << i << "*P_in_" << i << "];\n";
3956ca0f394SUmesh Unnikrishnan 
3966ca0f394SUmesh Unnikrishnan       bool is_strided;
3976ca0f394SUmesh Unnikrishnan       CeedCallBackend(CeedElemRestrictionIsStrided(Erestrict, &is_strided));
3986ca0f394SUmesh Unnikrishnan       if (!is_strided) {
3996ca0f394SUmesh Unnikrishnan         CeedCallBackend(CeedElemRestrictionGetLVectorSize(Erestrict, &lsize));
4006ca0f394SUmesh Unnikrishnan         code << "    const CeedInt lsize_in_" << i << " = " << lsize << ";\n";
4016ca0f394SUmesh Unnikrishnan         CeedInt comp_stride;
4026ca0f394SUmesh Unnikrishnan         CeedCallBackend(CeedElemRestrictionGetCompStride(Erestrict, &comp_stride));
4036ca0f394SUmesh Unnikrishnan         code << "    // CompStride: " << comp_stride << "\n";
4046ca0f394SUmesh Unnikrishnan         CeedCallBackend(CeedElemRestrictionGetData(Erestrict, &restr_impl));
4056ca0f394SUmesh Unnikrishnan         h_indices.inputs[i] = restr_impl->d_ind;
4066ca0f394SUmesh Unnikrishnan         code << "    readDofsOffset" << dim << "d(num_comp_in_" << i << ", " << comp_stride << ", P_in_" << i << ", num_elem, indices->inputs[" << i
4076ca0f394SUmesh Unnikrishnan              << "], d_u_" << i << ", r_u_" << i << ");\n";
4086ca0f394SUmesh Unnikrishnan       } else {
4096ca0f394SUmesh Unnikrishnan         bool has_backend_strides;
4106ca0f394SUmesh Unnikrishnan         CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides));
4116ca0f394SUmesh Unnikrishnan         CeedInt num_elem;
4126ca0f394SUmesh Unnikrishnan         CeedCallBackend(CeedElemRestrictionGetNumElements(Erestrict, &num_elem));
4136ca0f394SUmesh Unnikrishnan         CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
4146ca0f394SUmesh Unnikrishnan         if (!has_backend_strides) {
4156ca0f394SUmesh Unnikrishnan           CeedCallBackend(CeedElemRestrictionGetStrides(Erestrict, &strides));
4166ca0f394SUmesh Unnikrishnan         }
4176ca0f394SUmesh Unnikrishnan         code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
4186ca0f394SUmesh Unnikrishnan         code << "    readDofsStrided" << dim << "d(num_comp_in_" << i << ",P_in_" << i << "," << strides[0] << "," << strides[1] << "," << strides[2]
4196ca0f394SUmesh Unnikrishnan              << ", num_elem, d_u_" << i << ", r_u_" << i << ");\n";
4206ca0f394SUmesh Unnikrishnan       }
4216ca0f394SUmesh Unnikrishnan     }
4226ca0f394SUmesh Unnikrishnan 
4236ca0f394SUmesh Unnikrishnan     // Basis action
4246ca0f394SUmesh Unnikrishnan     code << "    // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
4256ca0f394SUmesh Unnikrishnan     switch (eval_mode) {
4266ca0f394SUmesh Unnikrishnan       case CEED_EVAL_NONE:
4276ca0f394SUmesh Unnikrishnan         if (!use_collograd_parallelization) {
4286ca0f394SUmesh Unnikrishnan           code << "    private CeedScalar* r_t_" << i << " = r_u_" << i << ";\n";
4296ca0f394SUmesh Unnikrishnan         }
4306ca0f394SUmesh Unnikrishnan         break;
4316ca0f394SUmesh Unnikrishnan       case CEED_EVAL_INTERP:
4326ca0f394SUmesh Unnikrishnan         code << "    CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1D];\n";
4336ca0f394SUmesh Unnikrishnan         code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_in_" << i << ", P_in_" << i << ", Q_1D, r_u_" << i << ", s_B_in_" << i
4346ca0f394SUmesh Unnikrishnan              << ", r_t_" << i << ", elem_scratch);\n";
4356ca0f394SUmesh Unnikrishnan         break;
4366ca0f394SUmesh Unnikrishnan       case CEED_EVAL_GRAD:
4376ca0f394SUmesh Unnikrishnan         if (use_collograd_parallelization) {
4386ca0f394SUmesh Unnikrishnan           code << "    CeedScalar r_t_" << i << "[num_comp_in_" << i << "*Q_1D];\n";
4396ca0f394SUmesh Unnikrishnan           code << "    Interp" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_in_" << i << ", P_in_" << i << ", Q_1D, r_u_" << i << ", s_B_in_"
4406ca0f394SUmesh Unnikrishnan                << i << ", r_t_" << i << ", elem_scratch);\n";
4416ca0f394SUmesh Unnikrishnan         } else {
4426ca0f394SUmesh Unnikrishnan           CeedInt P_1d;
4436ca0f394SUmesh Unnikrishnan           CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
4446ca0f394SUmesh Unnikrishnan           CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
4456ca0f394SUmesh Unnikrishnan           code << "    CeedScalar r_t_" << i << "[num_comp_in_" << i << "*DIM*Q_1D];\n";
4466ca0f394SUmesh Unnikrishnan           code << "    Grad" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d(num_comp_in_" << i
4476ca0f394SUmesh Unnikrishnan                << ", P_in_" << i << ", Q_1D, r_u_" << i << (dim > 1 ? ", s_B_in_" : "") << (dim > 1 ? std::to_string(i) : "") << ", s_G_in_" << i
4486ca0f394SUmesh Unnikrishnan                << ", r_t_" << i << ", elem_scratch);\n";
4496ca0f394SUmesh Unnikrishnan         }
4506ca0f394SUmesh Unnikrishnan         break;
4516ca0f394SUmesh Unnikrishnan       case CEED_EVAL_WEIGHT:
4526ca0f394SUmesh Unnikrishnan         code << "    CeedScalar r_t_" << i << "[Q_1D];\n";
4536ca0f394SUmesh Unnikrishnan         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
4546ca0f394SUmesh Unnikrishnan         CeedCallBackend(CeedBasisGetData(basis, &basis_impl));
4556ca0f394SUmesh Unnikrishnan         impl->W = basis_impl->d_q_weight_1d;
4566ca0f394SUmesh Unnikrishnan         code << "    Weight" << (dim > 1 ? "Tensor" : "") << dim << "d(Q_1D, W, r_t_" << i << ");\n";
4576ca0f394SUmesh Unnikrishnan         break;  // No action
4586ca0f394SUmesh Unnikrishnan       case CEED_EVAL_DIV:
4596ca0f394SUmesh Unnikrishnan         break;  // TODO: Not implemented
4606ca0f394SUmesh Unnikrishnan       case CEED_EVAL_CURL:
4616ca0f394SUmesh Unnikrishnan         break;  // TODO: Not implemented
4626ca0f394SUmesh Unnikrishnan     }
4636ca0f394SUmesh Unnikrishnan   }
4646ca0f394SUmesh Unnikrishnan 
4656ca0f394SUmesh Unnikrishnan   // Q function
4666ca0f394SUmesh Unnikrishnan   code << "\n    // -- Output field setup --\n";
4676ca0f394SUmesh Unnikrishnan   for (CeedInt i = 0; i < num_output_fields; i++) {
4686ca0f394SUmesh Unnikrishnan     code << "\n    // ---- Output field " << i << " ----\n";
4696ca0f394SUmesh Unnikrishnan     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
4706ca0f394SUmesh Unnikrishnan     if (eval_mode == CEED_EVAL_GRAD) {
4716ca0f394SUmesh Unnikrishnan       if (use_collograd_parallelization) {
4726ca0f394SUmesh Unnikrishnan         // Accumulator for gradient slices
4736ca0f394SUmesh Unnikrishnan         code << "    CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*Q_1D];\n";
4746ca0f394SUmesh Unnikrishnan         code << "    for (CeedInt i = 0; i < num_comp_out_" << i << "; i++) {\n";
4756ca0f394SUmesh Unnikrishnan         code << "      for (CeedInt j = 0; j < Q_1D; ++j) {\n";
4766ca0f394SUmesh Unnikrishnan         code << "        r_tt_" << i << "[j + i*Q_1D] = 0.0;\n";
4776ca0f394SUmesh Unnikrishnan         code << "      }\n";
4786ca0f394SUmesh Unnikrishnan         code << "    }\n";
4796ca0f394SUmesh Unnikrishnan       } else {
4806ca0f394SUmesh Unnikrishnan         code << "    CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*DIM*Q_1D];\n";
4816ca0f394SUmesh Unnikrishnan       }
4826ca0f394SUmesh Unnikrishnan     }
4836ca0f394SUmesh Unnikrishnan     if (eval_mode == CEED_EVAL_NONE || eval_mode == CEED_EVAL_INTERP) {
4846ca0f394SUmesh Unnikrishnan       code << "    CeedScalar r_tt_" << i << "[num_comp_out_" << i << "*Q_1D];\n";
4856ca0f394SUmesh Unnikrishnan     }
4866ca0f394SUmesh Unnikrishnan   }
4876ca0f394SUmesh Unnikrishnan   // We treat quadrature points per slice in 3d to save registers
4886ca0f394SUmesh Unnikrishnan   if (use_collograd_parallelization) {
4896ca0f394SUmesh Unnikrishnan     code << "\n    // Note: Using planes of 3D elements\n";
4906ca0f394SUmesh Unnikrishnan     code << "    for (CeedInt q = 0; q < Q_1D; q++) {\n";
4916ca0f394SUmesh Unnikrishnan     code << "      // -- Input fields --\n";
4926ca0f394SUmesh Unnikrishnan     for (CeedInt i = 0; i < num_input_fields; i++) {
4936ca0f394SUmesh Unnikrishnan       code << "      // ---- Input field " << i << " ----\n";
4946ca0f394SUmesh Unnikrishnan       // Get elem_size, eval_mode, num_comp
4956ca0f394SUmesh Unnikrishnan       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
4966ca0f394SUmesh Unnikrishnan       // Basis action
4976ca0f394SUmesh Unnikrishnan       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
4986ca0f394SUmesh Unnikrishnan       switch (eval_mode) {
4996ca0f394SUmesh Unnikrishnan         case CEED_EVAL_NONE:
5006ca0f394SUmesh Unnikrishnan           code << "      CeedScalar r_q_" << i << "[num_comp_in_" << i << "];\n";
5016ca0f394SUmesh Unnikrishnan 
5026ca0f394SUmesh Unnikrishnan           bool is_strided;
5036ca0f394SUmesh Unnikrishnan           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &Erestrict));
5046ca0f394SUmesh Unnikrishnan           CeedCallBackend(CeedElemRestrictionIsStrided(Erestrict, &is_strided));
5056ca0f394SUmesh Unnikrishnan           if (!is_strided) {
5066ca0f394SUmesh Unnikrishnan             CeedCallBackend(CeedElemRestrictionGetLVectorSize(Erestrict, &lsize));
5076ca0f394SUmesh Unnikrishnan             code << "      const CeedInt lsize_in_" << i << " = " << lsize << ";\n";
5086ca0f394SUmesh Unnikrishnan             CeedInt comp_stride;
5096ca0f394SUmesh Unnikrishnan             CeedCallBackend(CeedElemRestrictionGetCompStride(Erestrict, &comp_stride));
5106ca0f394SUmesh Unnikrishnan             code << "      // CompStride: " << comp_stride << "\n";
5116ca0f394SUmesh Unnikrishnan             CeedCallBackend(CeedElemRestrictionGetData(Erestrict, &restr_impl));
5126ca0f394SUmesh Unnikrishnan             h_indices.inputs[i] = restr_impl->d_ind;
5136ca0f394SUmesh Unnikrishnan             code << "      readSliceQuadsOffset"
5146ca0f394SUmesh Unnikrishnan                  << "3d(num_comp_in_" << i << ", " << comp_stride << ", Q_1D, lsize_in_" << i << ", num_elem, q, indices->inputs[" << i << "], d_u_"
5156ca0f394SUmesh Unnikrishnan                  << i << ", r_q_" << i << ");\n";
5166ca0f394SUmesh Unnikrishnan           } else {
5176ca0f394SUmesh Unnikrishnan             CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size));
5186ca0f394SUmesh Unnikrishnan             bool has_backend_strides;
5196ca0f394SUmesh Unnikrishnan             CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides));
5206ca0f394SUmesh Unnikrishnan             CeedInt num_elem;
5216ca0f394SUmesh Unnikrishnan             CeedCallBackend(CeedElemRestrictionGetNumElements(Erestrict, &num_elem));
5226ca0f394SUmesh Unnikrishnan             CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
5236ca0f394SUmesh Unnikrishnan             if (!has_backend_strides) {
5246ca0f394SUmesh Unnikrishnan               CeedCallBackend(CeedElemRestrictionGetStrides(Erestrict, &strides));
5256ca0f394SUmesh Unnikrishnan             }
5266ca0f394SUmesh Unnikrishnan             code << "      // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
5276ca0f394SUmesh Unnikrishnan             code << "      readSliceQuadsStrided"
5286ca0f394SUmesh Unnikrishnan                  << "3d(num_comp_in_" << i << ", Q_1D," << strides[0] << ", " << strides[1] << ", " << strides[2] << ", num_elem, q, d_u_" << i
5296ca0f394SUmesh Unnikrishnan                  << ", r_q_" << i << ");\n";
5306ca0f394SUmesh Unnikrishnan           }
5316ca0f394SUmesh Unnikrishnan           break;
5326ca0f394SUmesh Unnikrishnan         case CEED_EVAL_INTERP:
5336ca0f394SUmesh Unnikrishnan           code << "      CeedScalar r_q_" << i << "[num_comp_in_" << i << "];\n";
5346ca0f394SUmesh Unnikrishnan           code << "      for (CeedInt j = 0; j < num_comp_in_" << i << " ; ++j) {\n";
5356ca0f394SUmesh Unnikrishnan           code << "        r_q_" << i << "[j] = r_t_" << i << "[q + j*Q_1D];\n";
5366ca0f394SUmesh Unnikrishnan           code << "      }\n";
5376ca0f394SUmesh Unnikrishnan           break;
5386ca0f394SUmesh Unnikrishnan         case CEED_EVAL_GRAD:
5396ca0f394SUmesh Unnikrishnan           code << "      CeedScalar r_q_" << i << "[num_comp_in_" << i << "*DIM];\n";
5406ca0f394SUmesh Unnikrishnan           code << "      gradCollo3d(num_comp_in_" << i << ", Q_1D, q, r_t_" << i << ", s_G_in_" << i << ", r_q_" << i << ", elem_scratch);\n";
5416ca0f394SUmesh Unnikrishnan           break;
5426ca0f394SUmesh Unnikrishnan         case CEED_EVAL_WEIGHT:
5436ca0f394SUmesh Unnikrishnan           code << "      CeedScalar r_q_" << i << "[1];\n";
5446ca0f394SUmesh Unnikrishnan           code << "      r_q_" << i << "[0] = r_t_" << i << "[q];\n";
5456ca0f394SUmesh Unnikrishnan           break;  // No action
5466ca0f394SUmesh Unnikrishnan         case CEED_EVAL_DIV:
5476ca0f394SUmesh Unnikrishnan           break;  // TODO: Not implemented
5486ca0f394SUmesh Unnikrishnan         case CEED_EVAL_CURL:
5496ca0f394SUmesh Unnikrishnan           break;  // TODO: Not implemented
5506ca0f394SUmesh Unnikrishnan       }
5516ca0f394SUmesh Unnikrishnan     }
5526ca0f394SUmesh Unnikrishnan     code << "\n      // -- Output fields --\n";
5536ca0f394SUmesh Unnikrishnan     for (CeedInt i = 0; i < num_output_fields; i++) {
5546ca0f394SUmesh Unnikrishnan       code << "      // ---- Output field " << i << " ----\n";
5556ca0f394SUmesh Unnikrishnan       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
5566ca0f394SUmesh Unnikrishnan       // Basis action
5576ca0f394SUmesh Unnikrishnan       switch (eval_mode) {
5586ca0f394SUmesh Unnikrishnan         case CEED_EVAL_NONE:
5596ca0f394SUmesh Unnikrishnan           code << "      CeedScalar r_qq_" << i << "[num_comp_out_" << i << "];\n";
5606ca0f394SUmesh Unnikrishnan           break;  // No action
5616ca0f394SUmesh Unnikrishnan         case CEED_EVAL_INTERP:
5626ca0f394SUmesh Unnikrishnan           code << "      CeedScalar r_qq_" << i << "[num_comp_out_" << i << "];\n";
5636ca0f394SUmesh Unnikrishnan           break;
5646ca0f394SUmesh Unnikrishnan         case CEED_EVAL_GRAD:
5656ca0f394SUmesh Unnikrishnan           code << "      CeedScalar r_qq_" << i << "[num_comp_out_" << i << "*DIM];\n";
5666ca0f394SUmesh Unnikrishnan           break;
5676ca0f394SUmesh Unnikrishnan         case CEED_EVAL_WEIGHT:
5686ca0f394SUmesh Unnikrishnan           break;  // Should not occur
5696ca0f394SUmesh Unnikrishnan         case CEED_EVAL_DIV:
5706ca0f394SUmesh Unnikrishnan           break;  // TODO: Not implemented
5716ca0f394SUmesh Unnikrishnan         case CEED_EVAL_CURL:
5726ca0f394SUmesh Unnikrishnan           break;  // TODO: Not implemented
5736ca0f394SUmesh Unnikrishnan       }
5746ca0f394SUmesh Unnikrishnan     }
5756ca0f394SUmesh Unnikrishnan   } else {
5766ca0f394SUmesh Unnikrishnan     code << "\n      // Note: Using full elements\n";
5776ca0f394SUmesh Unnikrishnan     code << "      // -- Input fields --\n";
5786ca0f394SUmesh Unnikrishnan     for (CeedInt i = 0; i < num_input_fields; i++) {
5796ca0f394SUmesh Unnikrishnan       code << "      // ---- Input field " << i << " ----\n";
5806ca0f394SUmesh Unnikrishnan       code << "      private CeedScalar* r_q_" << i << " = r_t_" << i << ";\n";
5816ca0f394SUmesh Unnikrishnan     }
5826ca0f394SUmesh Unnikrishnan     code << "      // -- Output fields --\n";
5836ca0f394SUmesh Unnikrishnan     for (CeedInt i = 0; i < num_output_fields; i++) {
5846ca0f394SUmesh Unnikrishnan       code << "      // ---- Output field " << i << " ----\n";
5856ca0f394SUmesh Unnikrishnan       code << "      private CeedScalar* r_qq_" << i << " = r_tt_" << i << ";\n";
5866ca0f394SUmesh Unnikrishnan     }
5876ca0f394SUmesh Unnikrishnan   }
5886ca0f394SUmesh Unnikrishnan   //--------------------------------------------------
5896ca0f394SUmesh Unnikrishnan   code << "\n      // -- QFunction Inputs and outputs --\n";
5906ca0f394SUmesh Unnikrishnan   code << "      const CeedScalar * in[" << num_input_fields << "];\n";
5916ca0f394SUmesh Unnikrishnan   for (CeedInt i = 0; i < num_input_fields; i++) {
5926ca0f394SUmesh Unnikrishnan     code << "      // ---- Input field " << i << " ----\n";
5936ca0f394SUmesh Unnikrishnan     code << "      in[" << i << "] = r_q_" << i << ";\n";
5946ca0f394SUmesh Unnikrishnan   }
5956ca0f394SUmesh Unnikrishnan   code << "      CeedScalar * out[" << num_output_fields << "];\n";
5966ca0f394SUmesh Unnikrishnan   for (CeedInt i = 0; i < num_output_fields; i++) {
5976ca0f394SUmesh Unnikrishnan     code << "      // ---- Output field " << i << " ----\n";
5986ca0f394SUmesh Unnikrishnan     code << "      out[" << i << "] = r_qq_" << i << ";\n";
5996ca0f394SUmesh Unnikrishnan   }
6006ca0f394SUmesh Unnikrishnan 
6016ca0f394SUmesh Unnikrishnan   code << "\n      // -- Apply QFunction --\n";
6026ca0f394SUmesh Unnikrishnan   code << "      " << q_function_name << "(ctx, ";
6036ca0f394SUmesh Unnikrishnan   if (dim != 3 || use_collograd_parallelization) {
6046ca0f394SUmesh Unnikrishnan     code << "1";
6056ca0f394SUmesh Unnikrishnan   } else {
6066ca0f394SUmesh Unnikrishnan     code << "Q_1D";
6076ca0f394SUmesh Unnikrishnan   }
6086ca0f394SUmesh Unnikrishnan   code << ", in, out);\n";
6096ca0f394SUmesh Unnikrishnan   //--------------------------------------------------
6106ca0f394SUmesh Unnikrishnan 
6116ca0f394SUmesh Unnikrishnan   if (use_collograd_parallelization) {
6126ca0f394SUmesh Unnikrishnan     code << "      // -- Output fields --\n";
6136ca0f394SUmesh Unnikrishnan     for (CeedInt i = 0; i < num_output_fields; i++) {
6146ca0f394SUmesh Unnikrishnan       code << "      // ---- Output field " << i << " ----\n";
6156ca0f394SUmesh Unnikrishnan       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
6166ca0f394SUmesh Unnikrishnan       // Basis action
6176ca0f394SUmesh Unnikrishnan       code << "      // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
6186ca0f394SUmesh Unnikrishnan       switch (eval_mode) {
6196ca0f394SUmesh Unnikrishnan         case CEED_EVAL_NONE:
6206ca0f394SUmesh Unnikrishnan           code << "      for (CeedInt j = 0; j < num_comp_out_" << i << " ; ++j) {\n";
6216ca0f394SUmesh Unnikrishnan           code << "        r_tt_" << i << "[q + j*Q_1D] = r_qq_" << i << "[j];\n";
6226ca0f394SUmesh Unnikrishnan           code << "      }\n";
6236ca0f394SUmesh Unnikrishnan           break;  // No action
6246ca0f394SUmesh Unnikrishnan         case CEED_EVAL_INTERP:
6256ca0f394SUmesh Unnikrishnan           code << "      for (CeedInt j = 0; j < num_comp_out_" << i << " ; ++j) {\n";
6266ca0f394SUmesh Unnikrishnan           code << "        r_tt_" << i << "[q + j*Q_1D] = r_qq_" << i << "[j];\n";
6276ca0f394SUmesh Unnikrishnan           code << "      }\n";
6286ca0f394SUmesh Unnikrishnan           break;
6296ca0f394SUmesh Unnikrishnan         case CEED_EVAL_GRAD:
6306ca0f394SUmesh Unnikrishnan           code << "      gradColloTranspose3d(num_comp_out_" << i << ",Q_1D, q, r_qq_" << i << ", s_G_out_" << i << ", r_tt_" << i
6316ca0f394SUmesh Unnikrishnan                << ", elem_scratch);\n";
6326ca0f394SUmesh Unnikrishnan           break;
6336ca0f394SUmesh Unnikrishnan         case CEED_EVAL_WEIGHT:
6346ca0f394SUmesh Unnikrishnan           break;  // Should not occur
6356ca0f394SUmesh Unnikrishnan         case CEED_EVAL_DIV:
6366ca0f394SUmesh Unnikrishnan           break;  // TODO: Not implemented
6376ca0f394SUmesh Unnikrishnan         case CEED_EVAL_CURL:
6386ca0f394SUmesh Unnikrishnan           break;  // TODO: Not implemented
6396ca0f394SUmesh Unnikrishnan       }
6406ca0f394SUmesh Unnikrishnan     }
6416ca0f394SUmesh Unnikrishnan     code << "    }\n";
6426ca0f394SUmesh Unnikrishnan   }
6436ca0f394SUmesh Unnikrishnan 
6446ca0f394SUmesh Unnikrishnan   // Output basis apply if needed
6456ca0f394SUmesh Unnikrishnan   // Generate the correct eval mode code for each output
6466ca0f394SUmesh Unnikrishnan   code << "\n    // -- Output field basis action and restrictions --\n";
6476ca0f394SUmesh Unnikrishnan   for (CeedInt i = 0; i < num_output_fields; i++) {
6486ca0f394SUmesh Unnikrishnan     code << "    // ---- Output field " << i << " ----\n";
6496ca0f394SUmesh Unnikrishnan     // Get elem_size, eval_mode, num_comp
6506ca0f394SUmesh Unnikrishnan     CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &Erestrict));
6516ca0f394SUmesh Unnikrishnan     CeedCallBackend(CeedElemRestrictionGetElementSize(Erestrict, &elem_size));
6526ca0f394SUmesh Unnikrishnan     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
6536ca0f394SUmesh Unnikrishnan     CeedCallBackend(CeedElemRestrictionGetNumComponents(Erestrict, &num_comp));
6546ca0f394SUmesh Unnikrishnan     // Basis action
6556ca0f394SUmesh Unnikrishnan     code << "    // EvalMode: " << CeedEvalModes[eval_mode] << "\n";
6566ca0f394SUmesh Unnikrishnan     switch (eval_mode) {
6576ca0f394SUmesh Unnikrishnan       case CEED_EVAL_NONE:
6586ca0f394SUmesh Unnikrishnan         code << "    private CeedScalar* r_v_" << i << " = r_tt_" << i << ";\n";
6596ca0f394SUmesh Unnikrishnan         break;  // No action
6606ca0f394SUmesh Unnikrishnan       case CEED_EVAL_INTERP:
6616ca0f394SUmesh Unnikrishnan         code << "    CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n";
6626ca0f394SUmesh Unnikrishnan         code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_out_" << i << ",P_out_" << i << ", Q_1D, r_tt_" << i
6636ca0f394SUmesh Unnikrishnan              << ", s_B_out_" << i << ", r_v_" << i << ", elem_scratch);\n";
6646ca0f394SUmesh Unnikrishnan         break;
6656ca0f394SUmesh Unnikrishnan       case CEED_EVAL_GRAD:
6666ca0f394SUmesh Unnikrishnan         code << "    CeedScalar r_v_" << i << "[num_comp_out_" << i << "*P_out_" << i << "];\n";
6676ca0f394SUmesh Unnikrishnan         if (use_collograd_parallelization) {
6686ca0f394SUmesh Unnikrishnan           code << "    InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d(num_comp_out_" << i << ",P_out_" << i << ", Q_1D, r_tt_" << i
6696ca0f394SUmesh Unnikrishnan                << ", s_B_out_" << i << ", r_v_" << i << ", elem_scratch);\n";
6706ca0f394SUmesh Unnikrishnan         } else {
6716ca0f394SUmesh Unnikrishnan           CeedInt P_1d;
6726ca0f394SUmesh Unnikrishnan           CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
6736ca0f394SUmesh Unnikrishnan           CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
6746ca0f394SUmesh Unnikrishnan           code << "    GradTranspose" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d(num_comp_out_" << i
6756ca0f394SUmesh Unnikrishnan                << ", P_out_" << i << ", Q_1D, r_tt_" << i << (dim > 1 ? ", s_B_out_" : "") << (dim > 1 ? std::to_string(i) : "") << ", s_G_out_" << i
6766ca0f394SUmesh Unnikrishnan                << ", r_v_" << i << ", elem_scratch);\n";
6776ca0f394SUmesh Unnikrishnan         }
6786ca0f394SUmesh Unnikrishnan         break;
6796ca0f394SUmesh Unnikrishnan       // LCOV_EXCL_START
6806ca0f394SUmesh Unnikrishnan       case CEED_EVAL_WEIGHT: {
6816ca0f394SUmesh Unnikrishnan         Ceed ceed;
6826ca0f394SUmesh Unnikrishnan         CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
6836ca0f394SUmesh Unnikrishnan         return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
6846ca0f394SUmesh Unnikrishnan         break;  // Should not occur
6856ca0f394SUmesh Unnikrishnan       }
6866ca0f394SUmesh Unnikrishnan       case CEED_EVAL_DIV:
6876ca0f394SUmesh Unnikrishnan         break;  // TODO: Not implemented
6886ca0f394SUmesh Unnikrishnan       case CEED_EVAL_CURL:
6896ca0f394SUmesh Unnikrishnan         break;  // TODO: Not implemented
6906ca0f394SUmesh Unnikrishnan                 // LCOV_EXCL_STOP
6916ca0f394SUmesh Unnikrishnan     }
6926ca0f394SUmesh Unnikrishnan     // Restriction
6936ca0f394SUmesh Unnikrishnan     bool is_strided;
6946ca0f394SUmesh Unnikrishnan     CeedCallBackend(CeedElemRestrictionIsStrided(Erestrict, &is_strided));
6956ca0f394SUmesh Unnikrishnan     if (!is_strided) {
6966ca0f394SUmesh Unnikrishnan       CeedCallBackend(CeedElemRestrictionGetLVectorSize(Erestrict, &lsize));
6976ca0f394SUmesh Unnikrishnan       code << "    const CeedInt lsize_out_" << i << " = " << lsize << ";\n";
6986ca0f394SUmesh Unnikrishnan       CeedInt comp_stride;
6996ca0f394SUmesh Unnikrishnan       CeedCallBackend(CeedElemRestrictionGetCompStride(Erestrict, &comp_stride));
7006ca0f394SUmesh Unnikrishnan       code << "    // CompStride: " << comp_stride << "\n";
7016ca0f394SUmesh Unnikrishnan       CeedCallBackend(CeedElemRestrictionGetData(Erestrict, &restr_impl));
7026ca0f394SUmesh Unnikrishnan       h_indices.outputs[i] = restr_impl->d_ind;
7036ca0f394SUmesh Unnikrishnan       code << "    writeDofsOffset" << dim << "d(num_comp_out_" << i << ", " << comp_stride << ", P_out_" << i << ", num_elem, indices->outputs[" << i
7046ca0f394SUmesh Unnikrishnan            << "], r_v_" << i << ", d_v_" << i << ");\n";
7056ca0f394SUmesh Unnikrishnan     } else {
7066ca0f394SUmesh Unnikrishnan       bool has_backend_strides;
7076ca0f394SUmesh Unnikrishnan       CeedCallBackend(CeedElemRestrictionHasBackendStrides(Erestrict, &has_backend_strides));
7086ca0f394SUmesh Unnikrishnan       CeedInt num_elem;
7096ca0f394SUmesh Unnikrishnan       CeedCallBackend(CeedElemRestrictionGetNumElements(Erestrict, &num_elem));
7106ca0f394SUmesh Unnikrishnan       CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
7116ca0f394SUmesh Unnikrishnan       if (!has_backend_strides) {
7126ca0f394SUmesh Unnikrishnan         CeedCallBackend(CeedElemRestrictionGetStrides(Erestrict, &strides));
7136ca0f394SUmesh Unnikrishnan       }
7146ca0f394SUmesh Unnikrishnan       code << "    // Strides: {" << strides[0] << ", " << strides[1] << ", " << strides[2] << "}\n";
7156ca0f394SUmesh Unnikrishnan       code << "    writeDofsStrided" << dim << "d(num_comp_out_" << i << ",P_out_" << i << "," << strides[0] << "," << strides[1] << "," << strides[2]
7166ca0f394SUmesh Unnikrishnan            << ", num_elem, r_v_" << i << ", d_v_" << i << ");\n";
7176ca0f394SUmesh Unnikrishnan     }
7186ca0f394SUmesh Unnikrishnan   }
7196ca0f394SUmesh Unnikrishnan 
7206ca0f394SUmesh Unnikrishnan   code << "  }\n";
7216ca0f394SUmesh Unnikrishnan   code << "}\n";
7226ca0f394SUmesh Unnikrishnan   code << "// -----------------------------------------------------------------------------\n\n";
7236ca0f394SUmesh Unnikrishnan 
7246ca0f394SUmesh Unnikrishnan   // Copy the struct (containing device addresses) from the host to the device
7256ca0f394SUmesh Unnikrishnan   sycl::event copy_B       = sycl_data->sycl_queue.copy<Fields_Sycl>(&h_B, impl->B, 1);
7266ca0f394SUmesh Unnikrishnan   sycl::event copy_G       = sycl_data->sycl_queue.copy<Fields_Sycl>(&h_G, impl->G, 1);
7276ca0f394SUmesh Unnikrishnan   sycl::event copy_indices = sycl_data->sycl_queue.copy<FieldsInt_Sycl>(&h_indices, impl->indices, 1);
7286ca0f394SUmesh Unnikrishnan   // These copies can happen while the JIT is being done
7296ca0f394SUmesh Unnikrishnan   CeedCallSycl(ceed, sycl::event::wait_and_throw({copy_B, copy_G, copy_indices}));
7306ca0f394SUmesh Unnikrishnan 
7316ca0f394SUmesh Unnikrishnan   // View kernel for debugging
7326ca0f394SUmesh Unnikrishnan   CeedDebug256(ceed, 2, "Generated Operator Kernels:\n");
7336ca0f394SUmesh Unnikrishnan   CeedDebug(ceed, code.str().c_str());
7346ca0f394SUmesh Unnikrishnan 
7356ca0f394SUmesh Unnikrishnan   std::map<std::string, CeedInt> jit_constants;
7366ca0f394SUmesh Unnikrishnan   jit_constants["T_1D"]         = block_sizes[0];
7376ca0f394SUmesh Unnikrishnan   jit_constants["GROUP_SIZE_X"] = block_sizes[0];
7386ca0f394SUmesh Unnikrishnan   jit_constants["GROUP_SIZE_Y"] = block_sizes[1];
7396ca0f394SUmesh Unnikrishnan   jit_constants["GROUP_SIZE_Z"] = block_sizes[2];
7406ca0f394SUmesh Unnikrishnan 
7416ca0f394SUmesh Unnikrishnan   // Compile kernel into a kernel bundle
7426ca0f394SUmesh Unnikrishnan   CeedCallBackend(CeedBuildModule_Sycl(ceed, code.str(), &impl->sycl_module, jit_constants));
7436ca0f394SUmesh Unnikrishnan 
7446ca0f394SUmesh Unnikrishnan   // Load kernel function
7456ca0f394SUmesh Unnikrishnan   CeedCallBackend(CeedGetKernel_Sycl(ceed, impl->sycl_module, operator_name, &impl->op));
7466ca0f394SUmesh Unnikrishnan 
7476ca0f394SUmesh Unnikrishnan   CeedCallBackend(CeedOperatorSetSetupDone(op));
7486ca0f394SUmesh Unnikrishnan   return CEED_ERROR_SUCCESS;
7496ca0f394SUmesh Unnikrishnan }
7506ca0f394SUmesh Unnikrishnan 
7516ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
752