xref: /libCEED/rust/libceed-sys/c-src/backends/hip-gen/ceed-hip-gen-operator-build.cpp (revision 5daefc96c3d6c1b0bbb656215f0640792d88e993)
1d275d636SJeremy L Thompson // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
37d8d0e25Snbeams //
43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
57d8d0e25Snbeams //
63d8e8822SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
73d576824SJeremy L Thompson 
87d8d0e25Snbeams #define CEED_DEBUG_COLOR 12
97d8d0e25Snbeams 
1049aac155SJeremy L Thompson #include <ceed.h>
11ec3da8bcSJed Brown #include <ceed/backend.h>
120183ed61SJeremy L Thompson #include <ceed/gen-tools.h>
139e201c85SYohann #include <ceed/jit-tools.h>
142b730f8bSJeremy L Thompson 
157d8d0e25Snbeams #include <iostream>
167d8d0e25Snbeams #include <sstream>
172b730f8bSJeremy L Thompson #include <string>
182b730f8bSJeremy L Thompson 
190d0321e0SJeremy L Thompson #include "../hip-ref/ceed-hip-ref.h"
207d8d0e25Snbeams #include "../hip-shared/ceed-hip-shared.h"
21b2165e7aSSebastian Grimberg #include "../hip/ceed-hip-common.h"
227d8d0e25Snbeams #include "../hip/ceed-hip-compile.h"
232b730f8bSJeremy L Thompson #include "ceed-hip-gen.h"
24b3e1519bSnbeams 
2545a787f7SJeremy L Thompson struct FieldReuse_Hip {
2645a787f7SJeremy L Thompson   CeedInt      index;
2745a787f7SJeremy L Thompson   bool         is_input;
2845a787f7SJeremy L Thompson   CeedEvalMode eval_mode;
2945a787f7SJeremy L Thompson };
3045a787f7SJeremy L Thompson 
31b3e1519bSnbeams //------------------------------------------------------------------------------
32b3e1519bSnbeams // Calculate the block size used for launching the operator kernel
33b3e1519bSnbeams //------------------------------------------------------------------------------
342b730f8bSJeremy L Thompson extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt num_elem, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes) {
353a2968d6SJeremy L Thompson   const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
36b3e1519bSnbeams   if (dim == 1) {
373a2968d6SJeremy L Thompson     CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
38b7453713SJeremy L Thompson 
399e201c85SYohann     elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
403a2968d6SJeremy L Thompson     block_sizes[0]  = thread_1d;
41b3e1519bSnbeams     block_sizes[1]  = 1;
429e201c85SYohann     block_sizes[2]  = elems_per_block;
43b3e1519bSnbeams   } else if (dim == 2) {
443a2968d6SJeremy L Thompson     const CeedInt elems_per_block = thread_1d < 4 ? 16 : 2;
45b7453713SJeremy L Thompson 
463a2968d6SJeremy L Thompson     block_sizes[0] = thread_1d;
473a2968d6SJeremy L Thompson     block_sizes[1] = thread_1d;
489e201c85SYohann     block_sizes[2] = elems_per_block;
49b3e1519bSnbeams   } else if (dim == 3) {
503a2968d6SJeremy L Thompson     const CeedInt elems_per_block = thread_1d < 6 ? 4 : (thread_1d < 8 ? 2 : 1);
51b7453713SJeremy L Thompson 
523a2968d6SJeremy L Thompson     block_sizes[0] = thread_1d;
533a2968d6SJeremy L Thompson     block_sizes[1] = thread_1d;
549e201c85SYohann     block_sizes[2] = elems_per_block;
55b3e1519bSnbeams   }
56b3e1519bSnbeams   return CEED_ERROR_SUCCESS;
57b3e1519bSnbeams }
58b3e1519bSnbeams 
597d8d0e25Snbeams //------------------------------------------------------------------------------
604b3e95d5SJeremy L Thompson // Determine type of operator
614b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
624b3e95d5SJeremy L Thompson static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
634b3e95d5SJeremy L Thompson                                                CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields,
6474398b5aSJeremy L Thompson                                                CeedQFunctionField *qf_output_fields, CeedInt *max_P, CeedInt *max_P_1d, CeedInt *Q, CeedInt *Q_1d,
6574398b5aSJeremy L Thompson                                                CeedInt *max_dim, bool *is_all_tensor, bool *use_3d_slices) {
6674398b5aSJeremy L Thompson   // Check if all are tensor
6774398b5aSJeremy L Thompson   *is_all_tensor = true;
684b3e95d5SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
694b3e95d5SJeremy L Thompson     CeedBasis basis;
704b3e95d5SJeremy L Thompson 
714b3e95d5SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
724b3e95d5SJeremy L Thompson     if (basis != CEED_BASIS_NONE) {
734b3e95d5SJeremy L Thompson       bool is_field_tensor;
744b3e95d5SJeremy L Thompson 
754b3e95d5SJeremy L Thompson       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
7674398b5aSJeremy L Thompson       *is_all_tensor = *is_all_tensor && is_field_tensor;
774b3e95d5SJeremy L Thompson     }
783a2968d6SJeremy L Thompson     CeedCallBackend(CeedBasisDestroy(&basis));
794b3e95d5SJeremy L Thompson   }
804b3e95d5SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
814b3e95d5SJeremy L Thompson     CeedBasis basis;
824b3e95d5SJeremy L Thompson 
834b3e95d5SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
844b3e95d5SJeremy L Thompson     if (basis != CEED_BASIS_NONE) {
854b3e95d5SJeremy L Thompson       bool is_field_tensor;
864b3e95d5SJeremy L Thompson 
874b3e95d5SJeremy L Thompson       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
8874398b5aSJeremy L Thompson       *is_all_tensor = *is_all_tensor && is_field_tensor;
8974398b5aSJeremy L Thompson     }
9074398b5aSJeremy L Thompson     CeedCallBackend(CeedBasisDestroy(&basis));
9174398b5aSJeremy L Thompson   }
9274398b5aSJeremy L Thompson 
9374398b5aSJeremy L Thompson   // Find max_P, max_P_1d, Q, and Q_1d
9474398b5aSJeremy L Thompson   bool is_all_3d = true;
9574398b5aSJeremy L Thompson 
9674398b5aSJeremy L Thompson   *max_P    = 0;
9774398b5aSJeremy L Thompson   *max_P_1d = 0;
9874398b5aSJeremy L Thompson   *Q        = 0;
9974398b5aSJeremy L Thompson   *Q_1d     = 0;
10074398b5aSJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
10174398b5aSJeremy L Thompson     CeedBasis basis;
10274398b5aSJeremy L Thompson 
10374398b5aSJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
10474398b5aSJeremy L Thompson     if (basis != CEED_BASIS_NONE) {
10574398b5aSJeremy L Thompson       bool    is_field_tensor;
10674398b5aSJeremy L Thompson       CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
10774398b5aSJeremy L Thompson 
10874398b5aSJeremy L Thompson       // Check if 3D
1094b3e95d5SJeremy L Thompson       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
11074398b5aSJeremy L Thompson       is_all_3d = is_all_3d && (field_dim == 3);
11174398b5aSJeremy L Thompson       *max_dim  = CeedIntMax(*max_dim, field_dim);
11274398b5aSJeremy L Thompson 
11374398b5aSJeremy L Thompson       // Collect P, P_1d, Q, and Q_1d
11474398b5aSJeremy L Thompson       CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
11574398b5aSJeremy L Thompson       *max_P = CeedIntMax(*max_P, field_P);
11674398b5aSJeremy L Thompson       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
11774398b5aSJeremy L Thompson       if (is_field_tensor) {
11874398b5aSJeremy L Thompson         CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
11974398b5aSJeremy L Thompson         *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
12074398b5aSJeremy L Thompson       }
12174398b5aSJeremy L Thompson       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
12274398b5aSJeremy L Thompson       CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
12374398b5aSJeremy L Thompson       *Q = field_Q;
12474398b5aSJeremy L Thompson       if (is_field_tensor) {
12574398b5aSJeremy L Thompson         CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
1264b3e95d5SJeremy L Thompson         CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
1274b3e95d5SJeremy L Thompson         *Q_1d = field_Q_1d;
1284b3e95d5SJeremy L Thompson       }
12974398b5aSJeremy L Thompson     }
13074398b5aSJeremy L Thompson     CeedCallBackend(CeedBasisDestroy(&basis));
13174398b5aSJeremy L Thompson   }
13274398b5aSJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
13374398b5aSJeremy L Thompson     CeedBasis basis;
13474398b5aSJeremy L Thompson 
13574398b5aSJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
13674398b5aSJeremy L Thompson     if (basis != CEED_BASIS_NONE) {
13774398b5aSJeremy L Thompson       bool    is_field_tensor;
13874398b5aSJeremy L Thompson       CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
13974398b5aSJeremy L Thompson 
14074398b5aSJeremy L Thompson       // Check if 3D
14174398b5aSJeremy L Thompson       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
14274398b5aSJeremy L Thompson       is_all_3d = is_all_3d && (field_dim == 3);
14374398b5aSJeremy L Thompson       *max_dim  = CeedIntMax(*max_dim, field_dim);
14474398b5aSJeremy L Thompson 
14574398b5aSJeremy L Thompson       // Collect P, P_1d, Q, and Q_1d
14674398b5aSJeremy L Thompson       CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
14774398b5aSJeremy L Thompson       *max_P = CeedIntMax(*max_P, field_P);
14874398b5aSJeremy L Thompson       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
14974398b5aSJeremy L Thompson       if (is_field_tensor) {
15074398b5aSJeremy L Thompson         CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
15174398b5aSJeremy L Thompson         *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
15274398b5aSJeremy L Thompson       }
15374398b5aSJeremy L Thompson       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
15474398b5aSJeremy L Thompson       CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
15574398b5aSJeremy L Thompson       *Q = field_Q;
15674398b5aSJeremy L Thompson       if (is_field_tensor) {
15774398b5aSJeremy L Thompson         CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
15874398b5aSJeremy L Thompson         CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
15974398b5aSJeremy L Thompson         *Q_1d = field_Q_1d;
16074398b5aSJeremy L Thompson       }
16174398b5aSJeremy L Thompson     }
1623a2968d6SJeremy L Thompson     CeedCallBackend(CeedBasisDestroy(&basis));
1634b3e95d5SJeremy L Thompson   }
1644b3e95d5SJeremy L Thompson 
1654b3e95d5SJeremy L Thompson   // Only use 3D collocated gradient parallelization strategy when gradient is computed
1664b3e95d5SJeremy L Thompson   *use_3d_slices = false;
16774398b5aSJeremy L Thompson   if (is_all_3d && *is_all_tensor) {
1684b3e95d5SJeremy L Thompson     bool was_grad_found = false;
1694b3e95d5SJeremy L Thompson 
1704b3e95d5SJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) {
1714b3e95d5SJeremy L Thompson       CeedEvalMode eval_mode;
1724b3e95d5SJeremy L Thompson 
1734b3e95d5SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1744b3e95d5SJeremy L Thompson       if (eval_mode == CEED_EVAL_GRAD) {
1754b3e95d5SJeremy L Thompson         CeedBasis_Hip_shared *basis_data;
1764b3e95d5SJeremy L Thompson         CeedBasis             basis;
1774b3e95d5SJeremy L Thompson 
1784b3e95d5SJeremy L Thompson         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
1794b3e95d5SJeremy L Thompson         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
1804b3e95d5SJeremy L Thompson         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
1814b3e95d5SJeremy L Thompson         was_grad_found = true;
1823a2968d6SJeremy L Thompson         CeedCallBackend(CeedBasisDestroy(&basis));
1834b3e95d5SJeremy L Thompson       }
1844b3e95d5SJeremy L Thompson     }
1854b3e95d5SJeremy L Thompson     for (CeedInt i = 0; i < num_output_fields; i++) {
1864b3e95d5SJeremy L Thompson       CeedEvalMode eval_mode;
1874b3e95d5SJeremy L Thompson 
1884b3e95d5SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
1894b3e95d5SJeremy L Thompson       if (eval_mode == CEED_EVAL_GRAD) {
1904b3e95d5SJeremy L Thompson         CeedBasis_Hip_shared *basis_data;
1914b3e95d5SJeremy L Thompson         CeedBasis             basis;
1924b3e95d5SJeremy L Thompson 
1934b3e95d5SJeremy L Thompson         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
1944b3e95d5SJeremy L Thompson         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
1954b3e95d5SJeremy L Thompson         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
1964b3e95d5SJeremy L Thompson         was_grad_found = true;
1973a2968d6SJeremy L Thompson         CeedCallBackend(CeedBasisDestroy(&basis));
1984b3e95d5SJeremy L Thompson       }
1994b3e95d5SJeremy L Thompson     }
2004b3e95d5SJeremy L Thompson   }
2014b3e95d5SJeremy L Thompson   return CEED_ERROR_SUCCESS;
2024b3e95d5SJeremy L Thompson }
2034b3e95d5SJeremy L Thompson 
2044b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
2054b3e95d5SJeremy L Thompson // Setup fields
2064b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
2070183ed61SJeremy L Thompson static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i,
2080183ed61SJeremy L Thompson                                                     CeedOperatorField op_field, CeedQFunctionField qf_field, FieldReuse_Hip field_reuse,
2090183ed61SJeremy L Thompson                                                     CeedInt max_dim, CeedInt Q, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
2100183ed61SJeremy L Thompson                                                     bool use_3d_slices) {
21174398b5aSJeremy L Thompson   bool      is_tensor = true;
21274398b5aSJeremy L Thompson   CeedBasis basis;
21374398b5aSJeremy L Thompson   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
21474398b5aSJeremy L Thompson   if (basis != CEED_BASIS_NONE) CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
21574398b5aSJeremy L Thompson 
21659fa3f92SJeremy L Thompson   const char           *field_name;
2174b3e95d5SJeremy L Thompson   std::string           var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
2189123fb08SJeremy L Thompson   std::string           P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
2194b3e95d5SJeremy L Thompson   std::string           option_name = (is_input ? "inputs" : "outputs");
2204b3e95d5SJeremy L Thompson   CeedEvalMode          eval_mode   = CEED_EVAL_NONE;
22174398b5aSJeremy L Thompson   CeedInt               elem_size = 0, num_comp = 0, dim = max_dim, P_1d = 0;
2224b3e95d5SJeremy L Thompson   CeedElemRestriction   elem_rstr;
2234b3e95d5SJeremy L Thompson   CeedBasis_Hip_shared *basis_data;
2244b3e95d5SJeremy L Thompson 
2259ee499e5SJeremy L Thompson   // Field reuse info
22645a787f7SJeremy L Thompson   bool use_previous_field = field_reuse.index != -1;
2279ee499e5SJeremy L Thompson 
22859fa3f92SJeremy L Thompson   CeedCallBackend(CeedOperatorFieldGetName(op_field, &field_name));
2290183ed61SJeremy L Thompson   code << tab << "// -- " << (is_input ? "Input" : "Output") << " field " << i << ": " << field_name << "\n";
2304b3e95d5SJeremy L Thompson 
2314b3e95d5SJeremy L Thompson   // Get field data
2324b3e95d5SJeremy L Thompson   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
2334b3e95d5SJeremy L Thompson   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
2344b3e95d5SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
2354b3e95d5SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
2364b3e95d5SJeremy L Thompson   }
2373a2968d6SJeremy L Thompson   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
2384b3e95d5SJeremy L Thompson   if (basis != CEED_BASIS_NONE) {
2394b3e95d5SJeremy L Thompson     CeedCallBackend(CeedBasisGetData(basis, &basis_data));
24074398b5aSJeremy L Thompson     CeedCallBackend(CeedBasisGetDimension(basis, &dim));
2419123fb08SJeremy L Thompson     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
2429123fb08SJeremy L Thompson     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
2434b3e95d5SJeremy L Thompson   }
2444b3e95d5SJeremy L Thompson   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
2454b3e95d5SJeremy L Thompson 
2464b3e95d5SJeremy L Thompson   // Set field constants
2470183ed61SJeremy L Thompson   code << tab << "const CeedInt dim" << var_suffix << " = " << dim << ";\n";
24874398b5aSJeremy L Thompson   if (is_tensor && !is_all_tensor) {
24974398b5aSJeremy L Thompson     CeedInt P = 0;
25074398b5aSJeremy L Thompson 
25174398b5aSJeremy L Thompson     CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
2520183ed61SJeremy L Thompson     code << tab << "const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q : P) << ";\n";
25374398b5aSJeremy L Thompson   }
2540183ed61SJeremy L Thompson   code << tab << "const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
255343e3094SJeremy L Thompson   if (eval_mode != CEED_EVAL_WEIGHT) {
2560183ed61SJeremy L Thompson     code << tab << "const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
2574b3e95d5SJeremy L Thompson   }
2584b3e95d5SJeremy L Thompson 
2594b3e95d5SJeremy L Thompson   // Load basis data
2600183ed61SJeremy L Thompson   code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
2614b3e95d5SJeremy L Thompson   switch (eval_mode) {
2624b3e95d5SJeremy L Thompson     case CEED_EVAL_NONE:
2634b3e95d5SJeremy L Thompson       break;
2644b3e95d5SJeremy L Thompson     case CEED_EVAL_INTERP:
2653a2968d6SJeremy L Thompson       if (is_at_points) {
2663a2968d6SJeremy L Thompson         // AtPoints
2673a2968d6SJeremy L Thompson         if (!basis_data->d_chebyshev_interp_1d) {
2683a2968d6SJeremy L Thompson           CeedSize    interp_bytes;
2693a2968d6SJeremy L Thompson           CeedScalar *chebyshev_interp_1d;
2703a2968d6SJeremy L Thompson 
2713a2968d6SJeremy L Thompson           interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
2723a2968d6SJeremy L Thompson           CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
2733a2968d6SJeremy L Thompson           CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
2743a2968d6SJeremy L Thompson           CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
2753a2968d6SJeremy L Thompson           CeedCallHip(CeedBasisReturnCeed(basis),
2763a2968d6SJeremy L Thompson                       hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
2773a2968d6SJeremy L Thompson           CeedCallBackend(CeedFree(&chebyshev_interp_1d));
2783a2968d6SJeremy L Thompson         }
2793a2968d6SJeremy L Thompson         if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
2803a2968d6SJeremy L Thompson         else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
2813a2968d6SJeremy L Thompson       } else {
2823a2968d6SJeremy L Thompson         // Standard quadrature
2834b3e95d5SJeremy L Thompson         if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
2844b3e95d5SJeremy L Thompson         else data->B.outputs[i] = basis_data->d_interp_1d;
2853a2968d6SJeremy L Thompson       }
2869ee499e5SJeremy L Thompson       if (use_previous_field) {
28745a787f7SJeremy L Thompson         std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
2889ee499e5SJeremy L Thompson 
2890183ed61SJeremy L Thompson         code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
2909ee499e5SJeremy L Thompson       } else {
2910183ed61SJeremy L Thompson         code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
2920183ed61SJeremy L Thompson         code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
2939ee499e5SJeremy L Thompson       }
2944b3e95d5SJeremy L Thompson       break;
2954b3e95d5SJeremy L Thompson     case CEED_EVAL_GRAD:
2963a2968d6SJeremy L Thompson       if (is_at_points) {
2973a2968d6SJeremy L Thompson         // AtPoints
2983a2968d6SJeremy L Thompson         if (!basis_data->d_chebyshev_interp_1d) {
2993a2968d6SJeremy L Thompson           CeedSize    interp_bytes;
3003a2968d6SJeremy L Thompson           CeedScalar *chebyshev_interp_1d;
3013a2968d6SJeremy L Thompson 
3023a2968d6SJeremy L Thompson           interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
3033a2968d6SJeremy L Thompson           CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
3043a2968d6SJeremy L Thompson           CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
3053a2968d6SJeremy L Thompson           CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
3063a2968d6SJeremy L Thompson           CeedCallHip(CeedBasisReturnCeed(basis),
3073a2968d6SJeremy L Thompson                       hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
3083a2968d6SJeremy L Thompson           CeedCallBackend(CeedFree(&chebyshev_interp_1d));
3093a2968d6SJeremy L Thompson         }
3103a2968d6SJeremy L Thompson         if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
3113a2968d6SJeremy L Thompson         else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
3123a2968d6SJeremy L Thompson       } else {
3133a2968d6SJeremy L Thompson         // Standard quadrature
3144b3e95d5SJeremy L Thompson         if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
3154b3e95d5SJeremy L Thompson         else data->B.outputs[i] = basis_data->d_interp_1d;
3163a2968d6SJeremy L Thompson       }
3179123fb08SJeremy L Thompson       if (is_tensor) {
3189ee499e5SJeremy L Thompson         if (use_previous_field) {
31945a787f7SJeremy L Thompson           std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
3209ee499e5SJeremy L Thompson 
3210183ed61SJeremy L Thompson           code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
3229ee499e5SJeremy L Thompson         } else {
3230183ed61SJeremy L Thompson           code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
3240183ed61SJeremy L Thompson           code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
3259123fb08SJeremy L Thompson         }
3269ee499e5SJeremy L Thompson       }
3273a2968d6SJeremy L Thompson       if (is_at_points) break;  // No G mat for AtPoints
3284b3e95d5SJeremy L Thompson       if (use_3d_slices) {
3294b3e95d5SJeremy L Thompson         if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
3304b3e95d5SJeremy L Thompson         else data->G.outputs[i] = basis_data->d_collo_grad_1d;
33145a787f7SJeremy L Thompson         if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
33245a787f7SJeremy L Thompson           std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
3339ee499e5SJeremy L Thompson 
3340183ed61SJeremy L Thompson           code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
3359ee499e5SJeremy L Thompson         } else {
3360183ed61SJeremy L Thompson           code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
3370183ed61SJeremy L Thompson           code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
3389ee499e5SJeremy L Thompson         }
3394b3e95d5SJeremy L Thompson       } else {
3404b3e95d5SJeremy L Thompson         bool has_collo_grad = basis_data->d_collo_grad_1d;
3414b3e95d5SJeremy L Thompson 
3424b3e95d5SJeremy L Thompson         if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
3434b3e95d5SJeremy L Thompson         else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
3444b3e95d5SJeremy L Thompson         if (has_collo_grad) {
34545a787f7SJeremy L Thompson           if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
34645a787f7SJeremy L Thompson             std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
3479ee499e5SJeremy L Thompson 
3480183ed61SJeremy L Thompson             code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
3499ee499e5SJeremy L Thompson           } else {
3500183ed61SJeremy L Thompson             code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
3510183ed61SJeremy L Thompson             code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
3529ee499e5SJeremy L Thompson           }
3539ee499e5SJeremy L Thompson         } else {
35445a787f7SJeremy L Thompson           if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
35545a787f7SJeremy L Thompson             std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
3569ee499e5SJeremy L Thompson 
3570183ed61SJeremy L Thompson             code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
3584b3e95d5SJeremy L Thompson           } else {
3590183ed61SJeremy L Thompson             code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim")
36074398b5aSJeremy L Thompson                  << (is_tensor ? "" : var_suffix) << "];\n";
3610183ed61SJeremy L Thompson             code << tab << "LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << (is_tensor ? "" : var_suffix) << ">(data, G."
36274398b5aSJeremy L Thompson                  << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
3634b3e95d5SJeremy L Thompson           }
3644b3e95d5SJeremy L Thompson         }
3659ee499e5SJeremy L Thompson       }
3664b3e95d5SJeremy L Thompson       break;
3674b3e95d5SJeremy L Thompson     case CEED_EVAL_WEIGHT:
3684b3e95d5SJeremy L Thompson       break;  // No action
3694b3e95d5SJeremy L Thompson       // LCOV_EXCL_START
3704b3e95d5SJeremy L Thompson     case CEED_EVAL_DIV:
3714b3e95d5SJeremy L Thompson     case CEED_EVAL_CURL:
3724b3e95d5SJeremy L Thompson       break;  // TODO: Not implemented
3734b3e95d5SJeremy L Thompson               // LCOV_EXCL_STOP
3744b3e95d5SJeremy L Thompson   }
3753a2968d6SJeremy L Thompson   CeedCallBackend(CeedBasisDestroy(&basis));
3764b3e95d5SJeremy L Thompson   return CEED_ERROR_SUCCESS;
3774b3e95d5SJeremy L Thompson }
3784b3e95d5SJeremy L Thompson 
3794b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
3804b3e95d5SJeremy L Thompson // Restriction
3814b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
3820183ed61SJeremy L Thompson static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i,
3830183ed61SJeremy L Thompson                                                       CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
3840183ed61SJeremy L Thompson                                                       CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
3850183ed61SJeremy L Thompson                                                       bool use_3d_slices) {
3864b3e95d5SJeremy L Thompson   std::string              var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
38774398b5aSJeremy L Thompson   std::string              P_name     = (is_all_tensor ? "P_1d" : "P") + var_suffix;
3884b3e95d5SJeremy L Thompson   CeedEvalMode             eval_mode  = CEED_EVAL_NONE;
38974398b5aSJeremy L Thompson   CeedInt                  elem_size = 0, num_comp = 0;
3904b3e95d5SJeremy L Thompson   CeedSize                 l_size;
391f815fac9SJeremy L Thompson   CeedRestrictionType      rstr_type = CEED_RESTRICTION_STANDARD;
3924b3e95d5SJeremy L Thompson   CeedElemRestriction_Hip *rstr_data;
3934b3e95d5SJeremy L Thompson   CeedElemRestriction      elem_rstr;
3944b3e95d5SJeremy L Thompson 
3954b3e95d5SJeremy L Thompson   // Get field data
3964b3e95d5SJeremy L Thompson   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
3974b3e95d5SJeremy L Thompson   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
398f815fac9SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
3994b3e95d5SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
4004b3e95d5SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
4014b3e95d5SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
4024b3e95d5SJeremy L Thompson   }
4034b3e95d5SJeremy L Thompson   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
4044b3e95d5SJeremy L Thompson 
4054b3e95d5SJeremy L Thompson   // Restriction
4064b3e95d5SJeremy L Thompson   if (is_input) {
4074b3e95d5SJeremy L Thompson     // Input
408e93651e5SJeremy L Thompson     if (field_input_buffer[i] != i) {
409e93651e5SJeremy L Thompson       std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);
410e93651e5SJeremy L Thompson 
411e93651e5SJeremy L Thompson       // Restriction was already done for previous input
4120183ed61SJeremy L Thompson       code << tab << "CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
4133a2968d6SJeremy L Thompson     } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) {
4143a2968d6SJeremy L Thompson       if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) {
415e93651e5SJeremy L Thompson         // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated
4160183ed61SJeremy L Thompson         code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
4173a2968d6SJeremy L Thompson       } else if (rstr_type != CEED_RESTRICTION_POINTS) {
418e93651e5SJeremy L Thompson         // Otherwise we're using the scratch space
4190183ed61SJeremy L Thompson         code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
420e93651e5SJeremy L Thompson       }
421f815fac9SJeremy L Thompson       switch (rstr_type) {
422f815fac9SJeremy L Thompson         case CEED_RESTRICTION_STANDARD: {
4234b3e95d5SJeremy L Thompson           CeedInt comp_stride;
4244b3e95d5SJeremy L Thompson 
4254b3e95d5SJeremy L Thompson           CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
4260183ed61SJeremy L Thompson           code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
4274b3e95d5SJeremy L Thompson           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
4280183ed61SJeremy L Thompson           code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
4294b3e95d5SJeremy L Thompson           data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
4300183ed61SJeremy L Thompson           code << tab << "ReadLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
4310183ed61SJeremy L Thompson                << P_name << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix
4320183ed61SJeremy L Thompson                << ");\n";
433f815fac9SJeremy L Thompson           break;
434f815fac9SJeremy L Thompson         }
435f815fac9SJeremy L Thompson         case CEED_RESTRICTION_STRIDED: {
4364b3e95d5SJeremy L Thompson           bool    has_backend_strides;
4374b3e95d5SJeremy L Thompson           CeedInt num_elem;
4384b3e95d5SJeremy L Thompson 
4394b3e95d5SJeremy L Thompson           CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
4404b3e95d5SJeremy L Thompson           CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
4414b3e95d5SJeremy L Thompson           CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
4424b3e95d5SJeremy L Thompson 
4434b3e95d5SJeremy L Thompson           if (!has_backend_strides) {
4444b3e95d5SJeremy L Thompson             CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
4454b3e95d5SJeremy L Thompson           }
4460183ed61SJeremy L Thompson           code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
4470183ed61SJeremy L Thompson                << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
4480183ed61SJeremy L Thompson           code << tab << "ReadLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
4490183ed61SJeremy L Thompson                << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, d" << var_suffix << ", r_e"
4500183ed61SJeremy L Thompson                << var_suffix << ");\n";
451f815fac9SJeremy L Thompson           break;
452f815fac9SJeremy L Thompson         }
4533a2968d6SJeremy L Thompson         case CEED_RESTRICTION_POINTS: {
4543a2968d6SJeremy L Thompson           CeedInt comp_stride;
4553a2968d6SJeremy L Thompson 
4563a2968d6SJeremy L Thompson           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
4570183ed61SJeremy L Thompson           code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
4583a2968d6SJeremy L Thompson           data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
4593a2968d6SJeremy L Thompson           break;
4603a2968d6SJeremy L Thompson         }
461f815fac9SJeremy L Thompson         // LCOV_EXCL_START
462f815fac9SJeremy L Thompson         case CEED_RESTRICTION_ORIENTED:
463f815fac9SJeremy L Thompson         case CEED_RESTRICTION_CURL_ORIENTED:
464f815fac9SJeremy L Thompson           break;  // TODO: Not implemented
465f815fac9SJeremy L Thompson                   // LCOV_EXCL_STOP
4664b3e95d5SJeremy L Thompson       }
4674b3e95d5SJeremy L Thompson     }
4684b3e95d5SJeremy L Thompson   } else {
4694b3e95d5SJeremy L Thompson     // Output
470f815fac9SJeremy L Thompson     switch (rstr_type) {
471f815fac9SJeremy L Thompson       case CEED_RESTRICTION_STANDARD: {
4724b3e95d5SJeremy L Thompson         CeedInt comp_stride;
4734b3e95d5SJeremy L Thompson 
4744b3e95d5SJeremy L Thompson         CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
4750183ed61SJeremy L Thompson         code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
4764b3e95d5SJeremy L Thompson         CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
4770183ed61SJeremy L Thompson         code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
4784b3e95d5SJeremy L Thompson         data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
4790183ed61SJeremy L Thompson         code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
4800183ed61SJeremy L Thompson              << P_name << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix
4810183ed61SJeremy L Thompson              << ");\n";
482f815fac9SJeremy L Thompson         break;
483f815fac9SJeremy L Thompson       }
484f815fac9SJeremy L Thompson       case CEED_RESTRICTION_STRIDED: {
4854b3e95d5SJeremy L Thompson         bool    has_backend_strides;
4864b3e95d5SJeremy L Thompson         CeedInt num_elem;
4874b3e95d5SJeremy L Thompson 
4884b3e95d5SJeremy L Thompson         CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
4894b3e95d5SJeremy L Thompson         CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
4904b3e95d5SJeremy L Thompson         CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
4914b3e95d5SJeremy L Thompson 
4924b3e95d5SJeremy L Thompson         if (!has_backend_strides) {
4934b3e95d5SJeremy L Thompson           CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
4944b3e95d5SJeremy L Thompson         }
4950183ed61SJeremy L Thompson         code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
4960183ed61SJeremy L Thompson              << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
4970183ed61SJeremy L Thompson         code << tab << "WriteLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
4980183ed61SJeremy L Thompson              << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, r_e" << var_suffix << ", d" << var_suffix
4990183ed61SJeremy L Thompson              << ");\n";
500f815fac9SJeremy L Thompson         break;
501f815fac9SJeremy L Thompson       }
5023a2968d6SJeremy L Thompson       case CEED_RESTRICTION_POINTS:
5033a2968d6SJeremy L Thompson         data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
5043a2968d6SJeremy L Thompson         break;
505f815fac9SJeremy L Thompson       // LCOV_EXCL_START
506f815fac9SJeremy L Thompson       case CEED_RESTRICTION_ORIENTED:
507f815fac9SJeremy L Thompson       case CEED_RESTRICTION_CURL_ORIENTED:
508f815fac9SJeremy L Thompson         break;  // TODO: Not implemented
509f815fac9SJeremy L Thompson                 // LCOV_EXCL_STOP
5104b3e95d5SJeremy L Thompson     }
5114b3e95d5SJeremy L Thompson   }
5123a2968d6SJeremy L Thompson   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
5134b3e95d5SJeremy L Thompson   return CEED_ERROR_SUCCESS;
5144b3e95d5SJeremy L Thompson }
5154b3e95d5SJeremy L Thompson 
5164b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
5174b3e95d5SJeremy L Thompson // Basis
5184b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
5190183ed61SJeremy L Thompson static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i, CeedOperatorField op_field,
52074398b5aSJeremy L Thompson                                                 CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor,
5213a2968d6SJeremy L Thompson                                                 bool is_at_points, bool use_3d_slices) {
52274398b5aSJeremy L Thompson   bool      is_tensor = true;
52374398b5aSJeremy L Thompson   CeedBasis basis;
52474398b5aSJeremy L Thompson   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
52574398b5aSJeremy L Thompson   CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
52674398b5aSJeremy L Thompson 
5274b3e95d5SJeremy L Thompson   std::string         var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
5289123fb08SJeremy L Thompson   std::string         P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
5294b3e95d5SJeremy L Thompson   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
53074398b5aSJeremy L Thompson   CeedInt             dim = max_dim, elem_size = 0, num_comp = 0, P_1d = 0;
5314b3e95d5SJeremy L Thompson   CeedElemRestriction elem_rstr;
5324b3e95d5SJeremy L Thompson 
5334b3e95d5SJeremy L Thompson   // Get field data
5344b3e95d5SJeremy L Thompson   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
5354b3e95d5SJeremy L Thompson   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
5364b3e95d5SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
5374b3e95d5SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
5384b3e95d5SJeremy L Thompson   }
5393a2968d6SJeremy L Thompson   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
5404b3e95d5SJeremy L Thompson   if (basis != CEED_BASIS_NONE) {
54174398b5aSJeremy L Thompson     CeedCallBackend(CeedBasisGetDimension(basis, &dim));
5429123fb08SJeremy L Thompson     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
5439123fb08SJeremy L Thompson     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
5444b3e95d5SJeremy L Thompson   }
5454b3e95d5SJeremy L Thompson   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
5464b3e95d5SJeremy L Thompson 
5474b3e95d5SJeremy L Thompson   // Basis
5480183ed61SJeremy L Thompson   code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
5494b3e95d5SJeremy L Thompson   if (is_input) {
5504b3e95d5SJeremy L Thompson     switch (eval_mode) {
5514b3e95d5SJeremy L Thompson       case CEED_EVAL_NONE:
5523a2968d6SJeremy L Thompson         if (!use_3d_slices && !is_at_points) {
5530183ed61SJeremy L Thompson           code << tab << "CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n";
5544b3e95d5SJeremy L Thompson         }
5554b3e95d5SJeremy L Thompson         break;
5564b3e95d5SJeremy L Thompson       case CEED_EVAL_INTERP:
5573a2968d6SJeremy L Thompson         if (is_at_points) {
5589123fb08SJeremy L Thompson           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
5599123fb08SJeremy L Thompson 
5600183ed61SJeremy L Thompson           code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
5610183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
5626b92dc4bSJeremy L Thompson                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
5633a2968d6SJeremy L Thompson         } else {
56474398b5aSJeremy L Thompson           std::string function_name = is_tensor
56574398b5aSJeremy L Thompson                                           ? ((dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
56674398b5aSJeremy L Thompson                                           : "InterpNonTensor";
56774398b5aSJeremy L Thompson           std::string op_t_1d_name  = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
5689123fb08SJeremy L Thompson 
5690183ed61SJeremy L Thompson           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
5700183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
57174398b5aSJeremy L Thompson                << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
5723a2968d6SJeremy L Thompson         }
5734b3e95d5SJeremy L Thompson         break;
5744b3e95d5SJeremy L Thompson       case CEED_EVAL_GRAD:
5753a2968d6SJeremy L Thompson         if (is_at_points) {
5769123fb08SJeremy L Thompson           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
5779123fb08SJeremy L Thompson 
5780183ed61SJeremy L Thompson           code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
5790183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
5806b92dc4bSJeremy L Thompson                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
5813a2968d6SJeremy L Thompson         } else if (use_3d_slices) {
5829123fb08SJeremy L Thompson           std::string function_name = (dim > 1 ? "InterpTensor" : "Interp") + std::to_string(dim) + "d";
5839123fb08SJeremy L Thompson 
5840183ed61SJeremy L Thompson           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
5850183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
5866b92dc4bSJeremy L Thompson                << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
5879123fb08SJeremy L Thompson         } else if (is_tensor) {
5889123fb08SJeremy L Thompson           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
58974398b5aSJeremy L Thompson           std::string function_name = (dim == 1 ? "Grad" : (is_collocated ? "GradTensorCollocated" : "GradTensor")) + std::to_string(dim) + "d" +
59074398b5aSJeremy L Thompson                                       (is_all_tensor ? "" : "Flattened");
59174398b5aSJeremy L Thompson           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
5929123fb08SJeremy L Thompson 
5930183ed61SJeremy L Thompson           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
59474398b5aSJeremy L Thompson                << (is_all_tensor && dim >= 3 ? Q_name : "1") << "];\n";
5950183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
59674398b5aSJeremy L Thompson                << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
5974b3e95d5SJeremy L Thompson         } else {
5989123fb08SJeremy L Thompson           std::string function_name = "GradNonTensor";
5999123fb08SJeremy L Thompson 
6000183ed61SJeremy L Thompson           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
6010183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
60274398b5aSJeremy L Thompson                << ", OP_T_1D>(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
6034b3e95d5SJeremy L Thompson         }
6044b3e95d5SJeremy L Thompson         break;
6054b3e95d5SJeremy L Thompson       case CEED_EVAL_WEIGHT: {
6063a2968d6SJeremy L Thompson         if (is_at_points) {
6070183ed61SJeremy L Thompson           code << tab << "// Nothing to do AtPoints\n";
6083a2968d6SJeremy L Thompson         } else {
6094b3e95d5SJeremy L Thompson           CeedBasis_Hip_shared *basis_data;
61074398b5aSJeremy L Thompson           std::string           function_name = is_tensor
61174398b5aSJeremy L Thompson                                                     ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
61274398b5aSJeremy L Thompson                                                     : "WeightNonTensor";
6134b3e95d5SJeremy L Thompson 
6140183ed61SJeremy L Thompson           code << tab << "CeedScalar r_q" << var_suffix << "[" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
6154b3e95d5SJeremy L Thompson           CeedCallBackend(CeedBasisGetData(basis, &basis_data));
6164b3e95d5SJeremy L Thompson           data->W = basis_data->d_q_weight_1d;
6170183ed61SJeremy L Thompson           code << tab << function_name << "<" << P_name << ", " << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
6183a2968d6SJeremy L Thompson         }
6194b3e95d5SJeremy L Thompson         break;
6204b3e95d5SJeremy L Thompson       }
6214b3e95d5SJeremy L Thompson       // LCOV_EXCL_START
6224b3e95d5SJeremy L Thompson       case CEED_EVAL_DIV:
6234b3e95d5SJeremy L Thompson       case CEED_EVAL_CURL:
6244b3e95d5SJeremy L Thompson         break;  // TODO: Not implemented
6254b3e95d5SJeremy L Thompson                 // LCOV_EXCL_STOP
6264b3e95d5SJeremy L Thompson     }
6274b3e95d5SJeremy L Thompson   } else {
6284b3e95d5SJeremy L Thompson     switch (eval_mode) {
6294b3e95d5SJeremy L Thompson       case CEED_EVAL_NONE:
6300183ed61SJeremy L Thompson         code << tab << "CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
6314b3e95d5SJeremy L Thompson         break;  // No action
6324b3e95d5SJeremy L Thompson       case CEED_EVAL_INTERP:
6330183ed61SJeremy L Thompson         code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
6343a2968d6SJeremy L Thompson         if (is_at_points) {
6359123fb08SJeremy L Thompson           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
6369123fb08SJeremy L Thompson 
6370183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
6386b92dc4bSJeremy L Thompson                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
6393a2968d6SJeremy L Thompson         } else {
6409123fb08SJeremy L Thompson           std::string function_name =
64174398b5aSJeremy L Thompson               is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
64274398b5aSJeremy L Thompson                         : "InterpTransposeNonTensor";
64374398b5aSJeremy L Thompson           std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
6449123fb08SJeremy L Thompson 
6450183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
64674398b5aSJeremy L Thompson                << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
6473a2968d6SJeremy L Thompson         }
6484b3e95d5SJeremy L Thompson         break;
6494b3e95d5SJeremy L Thompson       case CEED_EVAL_GRAD:
6500183ed61SJeremy L Thompson         code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
6513a2968d6SJeremy L Thompson         if (is_at_points) {
6529123fb08SJeremy L Thompson           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
6539123fb08SJeremy L Thompson 
6540183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
6556b92dc4bSJeremy L Thompson                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
6563a2968d6SJeremy L Thompson         } else if (use_3d_slices) {
6579123fb08SJeremy L Thompson           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
6589123fb08SJeremy L Thompson 
6590183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
6606b92dc4bSJeremy L Thompson                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
6619123fb08SJeremy L Thompson         } else if (is_tensor) {
6629123fb08SJeremy L Thompson           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
66374398b5aSJeremy L Thompson           std::string function_name = (dim == 1 ? "GradTranspose" : (is_collocated ? "GradTransposeTensorCollocated" : "GradTransposeTensor")) +
66474398b5aSJeremy L Thompson                                       std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
66574398b5aSJeremy L Thompson           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
6669123fb08SJeremy L Thompson 
6670183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
66874398b5aSJeremy L Thompson                << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
6694b3e95d5SJeremy L Thompson         } else {
6709123fb08SJeremy L Thompson           std::string function_name = "GradTransposeNonTensor";
6719123fb08SJeremy L Thompson 
6720183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
67374398b5aSJeremy L Thompson                << ", OP_T_1D>(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
6744b3e95d5SJeremy L Thompson         }
6754b3e95d5SJeremy L Thompson         break;
6764b3e95d5SJeremy L Thompson       // LCOV_EXCL_START
6774b3e95d5SJeremy L Thompson       case CEED_EVAL_WEIGHT:
6784b3e95d5SJeremy L Thompson         break;  // Should not occur
6794b3e95d5SJeremy L Thompson       case CEED_EVAL_DIV:
6804b3e95d5SJeremy L Thompson       case CEED_EVAL_CURL:
6814b3e95d5SJeremy L Thompson         break;  // TODO: Not implemented
6824b3e95d5SJeremy L Thompson                 // LCOV_EXCL_STOP
6834b3e95d5SJeremy L Thompson     }
6844b3e95d5SJeremy L Thompson   }
6853a2968d6SJeremy L Thompson   CeedCallBackend(CeedBasisDestroy(&basis));
6864b3e95d5SJeremy L Thompson   return CEED_ERROR_SUCCESS;
6874b3e95d5SJeremy L Thompson }
6884b3e95d5SJeremy L Thompson 
6894b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
6904b3e95d5SJeremy L Thompson // QFunction
6914b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
6920183ed61SJeremy L Thompson static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt max_dim,
6930183ed61SJeremy L Thompson                                                     CeedInt max_num_points, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
6940183ed61SJeremy L Thompson                                                     CeedQFunctionField *qf_input_fields, CeedInt num_output_fields,
6950183ed61SJeremy L Thompson                                                     CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields,
6960183ed61SJeremy L Thompson                                                     std::string qfunction_name, CeedInt Q_1d, bool is_all_tensor, bool is_at_points,
6970183ed61SJeremy L Thompson                                                     bool use_3d_slices) {
69874398b5aSJeremy L Thompson   std::string         Q_name    = is_all_tensor ? "Q_1d" : "Q";
6994b3e95d5SJeremy L Thompson   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
7004b3e95d5SJeremy L Thompson   CeedElemRestriction elem_rstr;
7014b3e95d5SJeremy L Thompson 
7028b97b69aSJeremy L Thompson   // Setup output arrays
7030183ed61SJeremy L Thompson   code << "\n";
7040183ed61SJeremy L Thompson   code << tab << "// -- Output field setup\n";
7054b3e95d5SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
70659fa3f92SJeremy L Thompson     const char *field_name;
7074b3e95d5SJeremy L Thompson     std::string var_suffix = "_out_" + std::to_string(i);
7084b3e95d5SJeremy L Thompson 
70959fa3f92SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
7100183ed61SJeremy L Thompson     code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
7114b3e95d5SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
7123a2968d6SJeremy L Thompson     switch (eval_mode) {
7133a2968d6SJeremy L Thompson       case CEED_EVAL_NONE:
7143a2968d6SJeremy L Thompson         if (is_at_points) {
7150183ed61SJeremy L Thompson           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n";
7163a2968d6SJeremy L Thompson         } else {
7170183ed61SJeremy L Thompson           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
71874398b5aSJeremy L Thompson                << "];\n";
7194b3e95d5SJeremy L Thompson         }
7203a2968d6SJeremy L Thompson         break;
7213a2968d6SJeremy L Thompson       case CEED_EVAL_INTERP:
7223a2968d6SJeremy L Thompson         if (is_at_points) {
7233a2968d6SJeremy L Thompson           // Accumulator for point data
7240183ed61SJeremy L Thompson           code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
7250183ed61SJeremy L Thompson           code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
726b8245c6cSJeremy L Thompson                << "[i] = 0.0;\n";
7273a2968d6SJeremy L Thompson         } else {
7280183ed61SJeremy L Thompson           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
72974398b5aSJeremy L Thompson                << "];\n";
7303a2968d6SJeremy L Thompson         }
7313a2968d6SJeremy L Thompson         break;
7323a2968d6SJeremy L Thompson       case CEED_EVAL_GRAD:
7333a2968d6SJeremy L Thompson         if (is_at_points) {
7343a2968d6SJeremy L Thompson           // Accumulator for point data
7350183ed61SJeremy L Thompson           code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
7360183ed61SJeremy L Thompson           code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
737b8245c6cSJeremy L Thompson                << "[i] = 0.0;\n";
7383a2968d6SJeremy L Thompson         } else if (use_3d_slices) {
7394b3e95d5SJeremy L Thompson           // Accumulator for gradient slices
7400183ed61SJeremy L Thompson           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
7410183ed61SJeremy L Thompson           code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) r_q" << var_suffix << "[i] = 0.0;\n";
7424b3e95d5SJeremy L Thompson         } else {
7430183ed61SJeremy L Thompson           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
74474398b5aSJeremy L Thompson                << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") << "];\n";
7454b3e95d5SJeremy L Thompson         }
7463a2968d6SJeremy L Thompson         break;
7473a2968d6SJeremy L Thompson       case CEED_EVAL_WEIGHT:
7483a2968d6SJeremy L Thompson         break;
7493a2968d6SJeremy L Thompson         // LCOV_EXCL_START
7503a2968d6SJeremy L Thompson       case CEED_EVAL_DIV:
7513a2968d6SJeremy L Thompson       case CEED_EVAL_CURL:
7523a2968d6SJeremy L Thompson         break;  // TODO: Not implemented
7533a2968d6SJeremy L Thompson                 // LCOV_EXCL_STOP
7544b3e95d5SJeremy L Thompson     }
7554b3e95d5SJeremy L Thompson   }
7564b3e95d5SJeremy L Thompson 
7573a2968d6SJeremy L Thompson   if (is_at_points) {
7583a2968d6SJeremy L Thompson     // We need to handle batches of points
7590183ed61SJeremy L Thompson     code << "\n";
7600183ed61SJeremy L Thompson     code << tab << "// Note: Using batches of points\n";
7610183ed61SJeremy L Thompson     code << tab << "const CeedInt point_loop_bound = (blockDim.x*blockDim.y) * ceil((1.0*max_num_points) / (blockDim.x*blockDim.y));\n\n";
7620183ed61SJeremy L Thompson     code << tab << "#pragma unroll\n";
7630183ed61SJeremy L Thompson     code << tab << "for (CeedInt i = threadIdx.x + threadIdx.y*blockDim.x; i < point_loop_bound; i += blockDim.x*blockDim.y) {\n";
7640183ed61SJeremy L Thompson     tab.push();
7650183ed61SJeremy L Thompson     code << tab << "const CeedInt p = i % max_num_points;\n\n";
7663a2968d6SJeremy L Thompson 
7670183ed61SJeremy L Thompson     code << tab << "// -- Coordinates\n";
7680183ed61SJeremy L Thompson     code << tab << "CeedScalar r_x[max_dim];\n";
7690183ed61SJeremy L Thompson     code << tab << "ReadPoint<max_dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n";
7703a2968d6SJeremy L Thompson 
7710183ed61SJeremy L Thompson     code << tab << "// -- Input fields\n";
7723a2968d6SJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) {
77359fa3f92SJeremy L Thompson       const char *field_name;
7743a2968d6SJeremy L Thompson       std::string var_suffix = "_in_" + std::to_string(i);
775f725b54bSJeremy L Thompson       std::string P_name     = "P_1d" + var_suffix;
7763a2968d6SJeremy L Thompson 
77759fa3f92SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
7780183ed61SJeremy L Thompson       code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
7793a2968d6SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
7803a2968d6SJeremy L Thompson       // Basis action
7810183ed61SJeremy L Thompson       code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
7823a2968d6SJeremy L Thompson       switch (eval_mode) {
7833a2968d6SJeremy L Thompson         case CEED_EVAL_NONE:
7840183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
7850183ed61SJeremy L Thompson           code << tab << "ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
7863a2968d6SJeremy L Thompson                << ", max_num_points>(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
7873a2968d6SJeremy L Thompson           break;
7883a2968d6SJeremy L Thompson         case CEED_EVAL_INTERP:
7890183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
7900183ed61SJeremy L Thompson           code << tab << "InterpAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
79174398b5aSJeremy L Thompson                << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
7923a2968d6SJeremy L Thompson           break;
7933a2968d6SJeremy L Thompson         case CEED_EVAL_GRAD:
7940183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
7950183ed61SJeremy L Thompson           code << tab << "GradAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
79674398b5aSJeremy L Thompson                << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
7973a2968d6SJeremy L Thompson           break;
7983a2968d6SJeremy L Thompson         case CEED_EVAL_WEIGHT:
7990183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[1];\n";
8000183ed61SJeremy L Thompson           code << tab << "r_s" << var_suffix << "[0] = 1.0;\n";
8013a2968d6SJeremy L Thompson           break;
8023a2968d6SJeremy L Thompson           // LCOV_EXCL_START
8033a2968d6SJeremy L Thompson         case CEED_EVAL_DIV:
8043a2968d6SJeremy L Thompson         case CEED_EVAL_CURL:
8053a2968d6SJeremy L Thompson           break;  // TODO: Not implemented
8063a2968d6SJeremy L Thompson                   // LCOV_EXCL_STOP
8073a2968d6SJeremy L Thompson       }
8083a2968d6SJeremy L Thompson     }
8090183ed61SJeremy L Thompson     code << "\n";
8100183ed61SJeremy L Thompson     code << tab << "// -- Output fields\n";
8113a2968d6SJeremy L Thompson     for (CeedInt i = 0; i < num_output_fields; i++) {
81259fa3f92SJeremy L Thompson       const char *field_name;
8133a2968d6SJeremy L Thompson       std::string var_suffix = "_out_" + std::to_string(i);
8143a2968d6SJeremy L Thompson 
81559fa3f92SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
8160183ed61SJeremy L Thompson       code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
8173a2968d6SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
8183a2968d6SJeremy L Thompson       // Basis action
8193a2968d6SJeremy L Thompson       switch (eval_mode) {
8203a2968d6SJeremy L Thompson         case CEED_EVAL_NONE:
8210183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
8223a2968d6SJeremy L Thompson           break;
8233a2968d6SJeremy L Thompson         case CEED_EVAL_INTERP:
8240183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
8253a2968d6SJeremy L Thompson           break;
8263a2968d6SJeremy L Thompson         case CEED_EVAL_GRAD:
8270183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
8283a2968d6SJeremy L Thompson           break;
8293a2968d6SJeremy L Thompson           // LCOV_EXCL_START
8303a2968d6SJeremy L Thompson         case CEED_EVAL_WEIGHT:
8313a2968d6SJeremy L Thompson           break;  // Should not occur
8323a2968d6SJeremy L Thompson         case CEED_EVAL_DIV:
8333a2968d6SJeremy L Thompson         case CEED_EVAL_CURL:
8343a2968d6SJeremy L Thompson           break;  // TODO: Not implemented
8353a2968d6SJeremy L Thompson                   // LCOV_EXCL_STOP
8363a2968d6SJeremy L Thompson       }
8373a2968d6SJeremy L Thompson     }
8383a2968d6SJeremy L Thompson 
8393a2968d6SJeremy L Thompson   } else if (use_3d_slices) {
8404b3e95d5SJeremy L Thompson     // We treat quadrature points per slice in 3d to save registers
8410183ed61SJeremy L Thompson     code << "\n";
8420183ed61SJeremy L Thompson     code << tab << "// Note: Using planes of 3D elements\n";
8430183ed61SJeremy L Thompson     code << tab << "#pragma unroll\n";
8440183ed61SJeremy L Thompson     code << tab << "for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
8450183ed61SJeremy L Thompson     tab.push();
8460183ed61SJeremy L Thompson     code << tab << "// -- Input fields\n";
8474b3e95d5SJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) {
84859fa3f92SJeremy L Thompson       const char *field_name;
8494b3e95d5SJeremy L Thompson       std::string var_suffix = "_in_" + std::to_string(i);
8504b3e95d5SJeremy L Thompson 
85159fa3f92SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
8520183ed61SJeremy L Thompson       code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
8534b3e95d5SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
8544b3e95d5SJeremy L Thompson       // Basis action
8550183ed61SJeremy L Thompson       code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
8564b3e95d5SJeremy L Thompson       switch (eval_mode) {
8574b3e95d5SJeremy L Thompson         case CEED_EVAL_NONE:
8584b3e95d5SJeremy L Thompson           bool is_strided;
8594b3e95d5SJeremy L Thompson 
8600183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
8614b3e95d5SJeremy L Thompson 
8624b3e95d5SJeremy L Thompson           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
8634b3e95d5SJeremy L Thompson           CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
8644b3e95d5SJeremy L Thompson           if (is_strided) {
8654b3e95d5SJeremy L Thompson             bool    has_backend_strides;
8664b3e95d5SJeremy L Thompson             CeedInt num_elem, elem_size;
8674b3e95d5SJeremy L Thompson 
8684b3e95d5SJeremy L Thompson             CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
8694b3e95d5SJeremy L Thompson             CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
8704b3e95d5SJeremy L Thompson             CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
8714b3e95d5SJeremy L Thompson             CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
8724b3e95d5SJeremy L Thompson 
8734b3e95d5SJeremy L Thompson             if (!has_backend_strides) {
8744b3e95d5SJeremy L Thompson               CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
8754b3e95d5SJeremy L Thompson             }
8760183ed61SJeremy L Thompson             code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
8770183ed61SJeremy L Thompson                  << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
8780183ed61SJeremy L Thompson             code << tab << "ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", strides" << var_suffix << "_0, strides"
8790183ed61SJeremy L Thompson                  << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n";
8804b3e95d5SJeremy L Thompson           } else {
8814b3e95d5SJeremy L Thompson             CeedSize                 l_size = 0;
8824b3e95d5SJeremy L Thompson             CeedInt                  comp_stride;
8834b3e95d5SJeremy L Thompson             CeedElemRestriction_Hip *rstr_data;
8844b3e95d5SJeremy L Thompson 
8854b3e95d5SJeremy L Thompson             CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
8860183ed61SJeremy L Thompson             code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
8874b3e95d5SJeremy L Thompson             CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
8880183ed61SJeremy L Thompson             code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
8894b3e95d5SJeremy L Thompson             CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
8904b3e95d5SJeremy L Thompson             data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
8910183ed61SJeremy L Thompson             code << tab << "ReadEVecSliceStandard3d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", " << Q_name << ">(data, l_size"
8920183ed61SJeremy L Thompson                  << var_suffix << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
8934b3e95d5SJeremy L Thompson           }
8949123fb08SJeremy L Thompson           CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
8954b3e95d5SJeremy L Thompson           break;
8964b3e95d5SJeremy L Thompson         case CEED_EVAL_INTERP:
8970183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
8980183ed61SJeremy L Thompson           code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n";
8990183ed61SJeremy L Thompson           tab.push();
9000183ed61SJeremy L Thompson           code << tab << "r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n";
9010183ed61SJeremy L Thompson           tab.pop();
9020183ed61SJeremy L Thompson           code << tab << "}\n";
9034b3e95d5SJeremy L Thompson           break;
9044b3e95d5SJeremy L Thompson         case CEED_EVAL_GRAD:
9050183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
9060183ed61SJeremy L Thompson           code << tab << "GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_q" << var_suffix << ", s_G"
9076b92dc4bSJeremy L Thompson                << var_suffix << ", r_s" << var_suffix << ");\n";
9084b3e95d5SJeremy L Thompson           break;
9094b3e95d5SJeremy L Thompson         case CEED_EVAL_WEIGHT:
9100183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[1];\n";
9110183ed61SJeremy L Thompson           code << tab << "r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n";
9123a2968d6SJeremy L Thompson           break;
9134b3e95d5SJeremy L Thompson           // LCOV_EXCL_START
9144b3e95d5SJeremy L Thompson         case CEED_EVAL_DIV:
9154b3e95d5SJeremy L Thompson         case CEED_EVAL_CURL:
9164b3e95d5SJeremy L Thompson           break;  // TODO: Not implemented
9174b3e95d5SJeremy L Thompson                   // LCOV_EXCL_STOP
9184b3e95d5SJeremy L Thompson       }
9194b3e95d5SJeremy L Thompson     }
9200183ed61SJeremy L Thompson     code << "\n";
9210183ed61SJeremy L Thompson     code << tab << "// -- Output fields\n";
9224b3e95d5SJeremy L Thompson     for (CeedInt i = 0; i < num_output_fields; i++) {
92359fa3f92SJeremy L Thompson       const char *field_name;
9244b3e95d5SJeremy L Thompson       std::string var_suffix = "_out_" + std::to_string(i);
9254b3e95d5SJeremy L Thompson 
92659fa3f92SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
9270183ed61SJeremy L Thompson       code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
9284b3e95d5SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
9294b3e95d5SJeremy L Thompson       // Basis action
9304b3e95d5SJeremy L Thompson       switch (eval_mode) {
9314b3e95d5SJeremy L Thompson         case CEED_EVAL_NONE:
9320183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
9333a2968d6SJeremy L Thompson           break;
9344b3e95d5SJeremy L Thompson         case CEED_EVAL_INTERP:
9350183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
9364b3e95d5SJeremy L Thompson           break;
9374b3e95d5SJeremy L Thompson         case CEED_EVAL_GRAD:
9380183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
9394b3e95d5SJeremy L Thompson           break;
9404b3e95d5SJeremy L Thompson           // LCOV_EXCL_START
9414b3e95d5SJeremy L Thompson         case CEED_EVAL_WEIGHT:
9424b3e95d5SJeremy L Thompson           break;  // Should not occur
9434b3e95d5SJeremy L Thompson         case CEED_EVAL_DIV:
9444b3e95d5SJeremy L Thompson         case CEED_EVAL_CURL:
9454b3e95d5SJeremy L Thompson           break;  // TODO: Not implemented
9464b3e95d5SJeremy L Thompson                   // LCOV_EXCL_STOP
9474b3e95d5SJeremy L Thompson       }
9484b3e95d5SJeremy L Thompson     }
9494b3e95d5SJeremy L Thompson   } else {
9500183ed61SJeremy L Thompson     code << "\n";
9510183ed61SJeremy L Thompson     code << tab << "// Note: Using full elements\n";
9520183ed61SJeremy L Thompson     code << tab << "{\n";
9530183ed61SJeremy L Thompson     tab.push();
9540183ed61SJeremy L Thompson     code << tab << "// -- Input fields\n";
9554b3e95d5SJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) {
95659fa3f92SJeremy L Thompson       const char *field_name;
95759fa3f92SJeremy L Thompson 
95859fa3f92SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
9590183ed61SJeremy L Thompson       code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
9600183ed61SJeremy L Thompson       code << tab << "CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n";
9614b3e95d5SJeremy L Thompson     }
9620183ed61SJeremy L Thompson     code << tab << "// -- Output fields\n";
9634b3e95d5SJeremy L Thompson     for (CeedInt i = 0; i < num_output_fields; i++) {
96459fa3f92SJeremy L Thompson       const char *field_name;
96559fa3f92SJeremy L Thompson 
96659fa3f92SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
9670183ed61SJeremy L Thompson       code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
9680183ed61SJeremy L Thompson       code << tab << "CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n";
9694b3e95d5SJeremy L Thompson     }
9704b3e95d5SJeremy L Thompson   }
9714b3e95d5SJeremy L Thompson 
9724b3e95d5SJeremy L Thompson   // Input and output buffers
9730183ed61SJeremy L Thompson   code << "\n";
9740183ed61SJeremy L Thompson   code << tab << "// -- QFunction inputs and outputs\n";
9750183ed61SJeremy L Thompson   code << tab << "// ---- Inputs\n";
9760183ed61SJeremy L Thompson   code << tab << "CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
9774b3e95d5SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
97859fa3f92SJeremy L Thompson     const char *field_name;
97959fa3f92SJeremy L Thompson 
98059fa3f92SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
9810183ed61SJeremy L Thompson     code << tab << "// ------ Input field " << i << ": " << field_name << "\n";
9820183ed61SJeremy L Thompson     code << tab << "inputs[" << i << "] = r_s_in_" << i << ";\n";
9834b3e95d5SJeremy L Thompson   }
9840183ed61SJeremy L Thompson   code << tab << "// ---- Outputs\n";
9850183ed61SJeremy L Thompson   code << tab << "CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
9864b3e95d5SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
98759fa3f92SJeremy L Thompson     const char *field_name;
98859fa3f92SJeremy L Thompson 
98959fa3f92SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
9900183ed61SJeremy L Thompson     code << tab << "// ------ Output field " << i << ": " << field_name << "\n";
9910183ed61SJeremy L Thompson     code << tab << "outputs[" << i << "] = r_s_out_" << i << ";\n";
9924b3e95d5SJeremy L Thompson   }
9934b3e95d5SJeremy L Thompson 
9944b3e95d5SJeremy L Thompson   // Apply QFunction
9950183ed61SJeremy L Thompson   code << "\n";
9960183ed61SJeremy L Thompson   code << tab << "// -- Apply QFunction\n";
9970183ed61SJeremy L Thompson   code << tab << "" << qfunction_name << "(ctx, ";
99874398b5aSJeremy L Thompson   if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
9994b3e95d5SJeremy L Thompson     code << "1";
10004b3e95d5SJeremy L Thompson   } else {
10019123fb08SJeremy L Thompson     code << Q_name;
10024b3e95d5SJeremy L Thompson   }
10034b3e95d5SJeremy L Thompson   code << ", inputs, outputs);\n";
10044b3e95d5SJeremy L Thompson 
10053a2968d6SJeremy L Thompson   if (is_at_points) {
10063a2968d6SJeremy L Thompson     // Map back to coefficients
10070183ed61SJeremy L Thompson     code << "\n";
10080183ed61SJeremy L Thompson     code << tab << "// -- Output fields\n";
10093a2968d6SJeremy L Thompson     for (CeedInt i = 0; i < num_output_fields; i++) {
101059fa3f92SJeremy L Thompson       const char *field_name;
10113a2968d6SJeremy L Thompson       std::string var_suffix = "_out_" + std::to_string(i);
10123a2968d6SJeremy L Thompson       std::string P_name     = "P_1d" + var_suffix;
10133a2968d6SJeremy L Thompson 
101459fa3f92SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
10150183ed61SJeremy L Thompson       code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
10163a2968d6SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
10173a2968d6SJeremy L Thompson       // Basis action
10180183ed61SJeremy L Thompson       code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
10193a2968d6SJeremy L Thompson       switch (eval_mode) {
10203a2968d6SJeremy L Thompson         case CEED_EVAL_NONE: {
10213a2968d6SJeremy L Thompson           CeedInt             comp_stride;
10223a2968d6SJeremy L Thompson           CeedElemRestriction elem_rstr;
10233a2968d6SJeremy L Thompson 
10243a2968d6SJeremy L Thompson           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
10253a2968d6SJeremy L Thompson           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
10263a2968d6SJeremy L Thompson           CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
10270183ed61SJeremy L Thompson           code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
10280183ed61SJeremy L Thompson           code << tab << "WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
10293a2968d6SJeremy L Thompson                << ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]"
10303a2968d6SJeremy L Thompson                << ", r_s" << var_suffix << ", d" << var_suffix << ");\n";
10313a2968d6SJeremy L Thompson           break;
10323a2968d6SJeremy L Thompson         }
10333a2968d6SJeremy L Thompson         case CEED_EVAL_INTERP:
10340183ed61SJeremy L Thompson           code << tab << "if (i >= points.num_per_elem[elem]) {\n";
10350183ed61SJeremy L Thompson           tab.push();
10360183ed61SJeremy L Thompson           code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
10370183ed61SJeremy L Thompson           tab.pop();
10380183ed61SJeremy L Thompson           code << tab << "}\n";
10390183ed61SJeremy L Thompson           code << tab << "InterpTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
1040f725b54bSJeremy L Thompson                << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
10413a2968d6SJeremy L Thompson           break;
10423a2968d6SJeremy L Thompson         case CEED_EVAL_GRAD:
10430183ed61SJeremy L Thompson           code << tab << "if (i >= points.num_per_elem[elem]) {\n";
10440183ed61SJeremy L Thompson           tab.push();
10450183ed61SJeremy L Thompson           code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
10460183ed61SJeremy L Thompson           tab.pop();
10470183ed61SJeremy L Thompson           code << tab << "}\n";
10480183ed61SJeremy L Thompson           code << tab << "GradTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
1049f725b54bSJeremy L Thompson                << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
10503a2968d6SJeremy L Thompson           break;
10513a2968d6SJeremy L Thompson           // LCOV_EXCL_START
10523a2968d6SJeremy L Thompson         case CEED_EVAL_WEIGHT:
10533a2968d6SJeremy L Thompson           break;  // Should not occur
10543a2968d6SJeremy L Thompson         case CEED_EVAL_DIV:
10553a2968d6SJeremy L Thompson         case CEED_EVAL_CURL:
10563a2968d6SJeremy L Thompson           break;  // TODO: Not implemented
10573a2968d6SJeremy L Thompson                   // LCOV_EXCL_STOP
10583a2968d6SJeremy L Thompson       }
10593a2968d6SJeremy L Thompson     }
10603a2968d6SJeremy L Thompson   } else if (use_3d_slices) {
10614b3e95d5SJeremy L Thompson     // Copy or apply transpose grad, if needed
10620183ed61SJeremy L Thompson     code << "\n";
10630183ed61SJeremy L Thompson     code << tab << "// -- Output fields\n";
10644b3e95d5SJeremy L Thompson     for (CeedInt i = 0; i < num_output_fields; i++) {
106559fa3f92SJeremy L Thompson       const char *field_name;
10664b3e95d5SJeremy L Thompson       std::string var_suffix = "_out_" + std::to_string(i);
10674b3e95d5SJeremy L Thompson       std::string P_name     = "P_1d" + var_suffix;
10684b3e95d5SJeremy L Thompson 
106959fa3f92SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
10700183ed61SJeremy L Thompson       code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
10714b3e95d5SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
10724b3e95d5SJeremy L Thompson       // Basis action
10730183ed61SJeremy L Thompson       code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
10744b3e95d5SJeremy L Thompson       switch (eval_mode) {
10754b3e95d5SJeremy L Thompson         case CEED_EVAL_NONE:
10760183ed61SJeremy L Thompson           code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
10770183ed61SJeremy L Thompson           tab.push();
10780183ed61SJeremy L Thompson           code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
10790183ed61SJeremy L Thompson           tab.pop();
10800183ed61SJeremy L Thompson           code << tab << "}\n";
10813a2968d6SJeremy L Thompson           break;
10824b3e95d5SJeremy L Thompson         case CEED_EVAL_INTERP:
10830183ed61SJeremy L Thompson           code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
10840183ed61SJeremy L Thompson           tab.push();
10850183ed61SJeremy L Thompson           code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
10860183ed61SJeremy L Thompson           tab.pop();
10870183ed61SJeremy L Thompson           code << tab << "}\n";
10884b3e95d5SJeremy L Thompson           break;
10894b3e95d5SJeremy L Thompson         case CEED_EVAL_GRAD:
10900183ed61SJeremy L Thompson           code << tab << "GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_s" << var_suffix << ", s_G"
1091f815fac9SJeremy L Thompson                << var_suffix << ", r_q" << var_suffix << ");\n";
10924b3e95d5SJeremy L Thompson           break;
10934b3e95d5SJeremy L Thompson           // LCOV_EXCL_START
10944b3e95d5SJeremy L Thompson         case CEED_EVAL_WEIGHT:
10954b3e95d5SJeremy L Thompson           break;  // Should not occur
10964b3e95d5SJeremy L Thompson         case CEED_EVAL_DIV:
10974b3e95d5SJeremy L Thompson         case CEED_EVAL_CURL:
10984b3e95d5SJeremy L Thompson           break;  // TODO: Not implemented
10994b3e95d5SJeremy L Thompson                   // LCOV_EXCL_STOP
11004b3e95d5SJeremy L Thompson       }
11014b3e95d5SJeremy L Thompson     }
11024b3e95d5SJeremy L Thompson   }
11030183ed61SJeremy L Thompson   tab.pop();
11040183ed61SJeremy L Thompson   code << tab << "}\n";
11054b3e95d5SJeremy L Thompson   return CEED_ERROR_SUCCESS;
11064b3e95d5SJeremy L Thompson }
11074b3e95d5SJeremy L Thompson 
11084b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
11099e201c85SYohann // Build single operator kernel
11107d8d0e25Snbeams //------------------------------------------------------------------------------
11118d12f40eSJeremy L Thompson extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_build) {
111274398b5aSJeremy L Thompson   bool                   is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
11137d8d0e25Snbeams   Ceed                   ceed;
1114efa41df3SJeremy L Thompson   CeedInt                Q = 0, Q_1d = 0, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
1115b7453713SJeremy L Thompson   CeedQFunctionField    *qf_input_fields, *qf_output_fields;
1116b7453713SJeremy L Thompson   CeedQFunction_Hip_gen *qf_data;
1117b7453713SJeremy L Thompson   CeedQFunction          qf;
1118b7453713SJeremy L Thompson   CeedOperatorField     *op_input_fields, *op_output_fields;
1119b7453713SJeremy L Thompson   CeedOperator_Hip_gen  *data;
11204b3e95d5SJeremy L Thompson   std::ostringstream     code;
11210183ed61SJeremy L Thompson   Tab                    tab;
11224b3e95d5SJeremy L Thompson 
11238d12f40eSJeremy L Thompson   CeedCallBackend(CeedOperatorGetData(op, &data));
11244b3e95d5SJeremy L Thompson   {
11254b3e95d5SJeremy L Thompson     bool is_setup_done;
1126b7453713SJeremy L Thompson 
1127b7453713SJeremy L Thompson     CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
11288d12f40eSJeremy L Thompson     if (is_setup_done) {
11298d12f40eSJeremy L Thompson       *is_good_build = !data->use_fallback;
11308d12f40eSJeremy L Thompson       return CEED_ERROR_SUCCESS;
11318d12f40eSJeremy L Thompson     }
11324b3e95d5SJeremy L Thompson   }
1133b7453713SJeremy L Thompson 
11348d12f40eSJeremy L Thompson   // Check field compatibility
11358d12f40eSJeremy L Thompson   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
11368d12f40eSJeremy L Thompson   {
113774398b5aSJeremy L Thompson     bool has_shared_bases = true;
11388d12f40eSJeremy L Thompson 
11398d12f40eSJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) {
11408d12f40eSJeremy L Thompson       CeedBasis basis;
11418d12f40eSJeremy L Thompson 
11428d12f40eSJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
11438d12f40eSJeremy L Thompson       if (basis != CEED_BASIS_NONE) {
11448d12f40eSJeremy L Thompson         bool        is_tensor = true;
11458d12f40eSJeremy L Thompson         const char *resource;
11468d12f40eSJeremy L Thompson         char       *resource_root;
11478d12f40eSJeremy L Thompson         Ceed        basis_ceed;
11488d12f40eSJeremy L Thompson 
11498d12f40eSJeremy L Thompson         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
1150c9192acaSJeremy L Thompson         is_all_tensor    = is_all_tensor && is_tensor;
1151c9192acaSJeremy L Thompson         is_all_nontensor = is_all_nontensor && !is_tensor;
11528d12f40eSJeremy L Thompson         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
11538d12f40eSJeremy L Thompson         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
11548d12f40eSJeremy L Thompson         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
1155c9192acaSJeremy L Thompson         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
11568d12f40eSJeremy L Thompson         CeedCallBackend(CeedFree(&resource_root));
11578d12f40eSJeremy L Thompson         CeedCallBackend(CeedDestroy(&basis_ceed));
11588d12f40eSJeremy L Thompson       }
11598d12f40eSJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis));
11608d12f40eSJeremy L Thompson     }
11618d12f40eSJeremy L Thompson 
11628d12f40eSJeremy L Thompson     for (CeedInt i = 0; i < num_output_fields; i++) {
11638d12f40eSJeremy L Thompson       CeedBasis basis;
11648d12f40eSJeremy L Thompson 
11658d12f40eSJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
11668d12f40eSJeremy L Thompson       if (basis != CEED_BASIS_NONE) {
11678d12f40eSJeremy L Thompson         bool        is_tensor = true;
11688d12f40eSJeremy L Thompson         const char *resource;
11698d12f40eSJeremy L Thompson         char       *resource_root;
11708d12f40eSJeremy L Thompson         Ceed        basis_ceed;
11718d12f40eSJeremy L Thompson 
11728d12f40eSJeremy L Thompson         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
1173c9192acaSJeremy L Thompson         is_all_tensor    = is_all_tensor && is_tensor;
1174c9192acaSJeremy L Thompson         is_all_nontensor = is_all_nontensor && !is_tensor;
11758d12f40eSJeremy L Thompson 
11768d12f40eSJeremy L Thompson         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
11778d12f40eSJeremy L Thompson         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
11788d12f40eSJeremy L Thompson         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
1179c9192acaSJeremy L Thompson         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
11808d12f40eSJeremy L Thompson         CeedCallBackend(CeedFree(&resource_root));
11818d12f40eSJeremy L Thompson         CeedCallBackend(CeedDestroy(&basis_ceed));
11828d12f40eSJeremy L Thompson       }
11838d12f40eSJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis));
11848d12f40eSJeremy L Thompson     }
11858d12f40eSJeremy L Thompson     // -- Fallback to ref if not all bases are shared
118674398b5aSJeremy L Thompson     if (!has_shared_bases) {
11878d12f40eSJeremy L Thompson       *is_good_build = false;
11888d12f40eSJeremy L Thompson       return CEED_ERROR_SUCCESS;
11898d12f40eSJeremy L Thompson     }
11908d12f40eSJeremy L Thompson   }
1191b7453713SJeremy L Thompson   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1192b7453713SJeremy L Thompson   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
1193b7453713SJeremy L Thompson   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
1194b7453713SJeremy L Thompson   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
11957d8d0e25Snbeams 
11964b3e95d5SJeremy L Thompson   // Get operator data
11973a2968d6SJeremy L Thompson   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
119874398b5aSJeremy L Thompson   {
1199efa41df3SJeremy L Thompson     CeedInt max_P = 0, max_P_1d = 0;
120074398b5aSJeremy L Thompson 
12014b3e95d5SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelData_Hip_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields,
120274398b5aSJeremy L Thompson                                                         qf_output_fields, &max_P, &max_P_1d, &Q, &Q_1d, &max_dim, &is_all_tensor, &use_3d_slices));
120374398b5aSJeremy L Thompson     data->max_P_1d = is_all_tensor ? max_P_1d : max_P;
120474398b5aSJeremy L Thompson   }
120574398b5aSJeremy L Thompson   if (max_dim == 0) max_dim = 1;
120674398b5aSJeremy L Thompson   data->dim = max_dim;
12073a2968d6SJeremy L Thompson   if (is_at_points) {
12083a2968d6SJeremy L Thompson     CeedElemRestriction_Hip *rstr_data;
12093a2968d6SJeremy L Thompson     CeedElemRestriction      rstr_points = NULL;
12104b3e95d5SJeremy L Thompson 
12113a2968d6SJeremy L Thompson     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
12123a2968d6SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
12133a2968d6SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
12143a2968d6SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data));
12153a2968d6SJeremy L Thompson     data->points.indices = (CeedInt *)rstr_data->d_offsets;
12163a2968d6SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
12173a2968d6SJeremy L Thompson   }
12183a2968d6SJeremy L Thompson   if (is_at_points) use_3d_slices = false;
12193a2968d6SJeremy L Thompson   if (Q_1d == 0) {
12203a2968d6SJeremy L Thompson     if (is_at_points) Q_1d = max_num_points;
12213a2968d6SJeremy L Thompson     else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d));
12224b3e95d5SJeremy L Thompson   }
122374398b5aSJeremy L Thompson   if (Q == 0) Q = Q_1d;
122474398b5aSJeremy L Thompson   data->Q    = Q;
12254b3e95d5SJeremy L Thompson   data->Q_1d = Q_1d;
12264b3e95d5SJeremy L Thompson 
12270b454692Sjeremylt   // Check for restriction only identity operator
12284b3e95d5SJeremy L Thompson   {
12294b3e95d5SJeremy L Thompson     bool is_identity_qf;
12304b3e95d5SJeremy L Thompson 
12312b730f8bSJeremy L Thompson     CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf));
12320b454692Sjeremylt     if (is_identity_qf) {
12339e201c85SYohann       CeedEvalMode eval_mode_in, eval_mode_out;
1234b7453713SJeremy L Thompson 
12352b730f8bSJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in));
12362b730f8bSJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out));
12376574a04fSJeremy L Thompson       CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND,
12386574a04fSJeremy L Thompson                 "Backend does not implement restriction only identity operators");
12390b454692Sjeremylt     }
12404b3e95d5SJeremy L Thompson   }
1241b2165e7aSSebastian Grimberg 
1242b2165e7aSSebastian Grimberg   // Load basis source files
1243eaf9ad10SZach Atkins   if (!is_all_nontensor) {
12440183ed61SJeremy L Thompson     code << tab << "// Tensor basis source\n";
12450183ed61SJeremy L Thompson     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
124674398b5aSJeremy L Thompson   }
124774398b5aSJeremy L Thompson   if (!is_all_tensor) {
12480183ed61SJeremy L Thompson     code << tab << "// Non-tensor basis source\n";
12490183ed61SJeremy L Thompson     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n";
12509123fb08SJeremy L Thompson   }
12519123fb08SJeremy L Thompson   if (is_at_points) {
12520183ed61SJeremy L Thompson     code << tab << "// AtPoints basis source\n";
12530183ed61SJeremy L Thompson     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n";
12549123fb08SJeremy L Thompson   }
125574398b5aSJeremy L Thompson   if (!is_all_tensor && !is_all_nontensor) {
12560183ed61SJeremy L Thompson     code << tab << "// Tensor basis source\n";
12570183ed61SJeremy L Thompson     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h>\n\n";
125874398b5aSJeremy L Thompson   }
12590183ed61SJeremy L Thompson   code << tab << "// CodeGen operator source\n";
12600183ed61SJeremy L Thompson   code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
12617d8d0e25Snbeams 
12624b3e95d5SJeremy L Thompson   // Get QFunction name
12634b3e95d5SJeremy L Thompson   std::string qfunction_name(qf_data->qfunction_name);
12644b3e95d5SJeremy L Thompson   std::string operator_name;
12654b3e95d5SJeremy L Thompson 
126609095acaSJeremy L Thompson   operator_name = "CeedKernelHipGenOperator_" + qfunction_name;
12677d8d0e25Snbeams 
12689e201c85SYohann   // Define CEED_Q_VLA
12690183ed61SJeremy L Thompson   code << "\n" << tab << "#undef CEED_Q_VLA\n";
127074398b5aSJeremy L Thompson   if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
12710183ed61SJeremy L Thompson     code << tab << "#define CEED_Q_VLA 1\n\n";
12729e201c85SYohann   } else {
12730183ed61SJeremy L Thompson     code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n";
12749e201c85SYohann   }
12759e201c85SYohann 
12764b3e95d5SJeremy L Thompson   // Add user QFunction source
12774b3e95d5SJeremy L Thompson   {
12789c25dd66SJeremy L Thompson     const char *source_path;
12794b3e95d5SJeremy L Thompson 
12809c25dd66SJeremy L Thompson     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
12819c25dd66SJeremy L Thompson     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
12829c25dd66SJeremy L Thompson 
12830183ed61SJeremy L Thompson     code << tab << "// User QFunction source\n";
12840183ed61SJeremy L Thompson     code << tab << "#include \"" << source_path << "\"\n\n";
12854b3e95d5SJeremy L Thompson   }
12867d8d0e25Snbeams 
12877d8d0e25Snbeams   // Setup
12880183ed61SJeremy L Thompson   code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
12890183ed61SJeremy L Thompson   code << tab << "// Operator Kernel\n";
12900183ed61SJeremy L Thompson   code << tab << "// \n";
12910183ed61SJeremy L Thompson   code << tab << "// d_[in,out]_i:   CeedVector device array\n";
12920183ed61SJeremy L Thompson   code << tab << "// r_[in,out]_e_i: Element vector register\n";
12930183ed61SJeremy L Thompson   code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
12940183ed61SJeremy L Thompson   code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
12950183ed61SJeremy L Thompson   code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
12960183ed61SJeremy L Thompson   code << tab << "// \n";
12970183ed61SJeremy L Thompson   code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
12980183ed61SJeremy L Thompson   code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
12990183ed61SJeremy L Thompson   code << tab << "// -----------------------------------------------------------------------------\n";
13000183ed61SJeremy L Thompson   code << tab << "extern \"C\" __launch_bounds__(BLOCK_SIZE)\n";
13012b730f8bSJeremy L Thompson   code << "__global__ void " << operator_name
13023a2968d6SJeremy L Thompson        << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar* W, Points_Hip points) {\n";
13030183ed61SJeremy L Thompson   tab.push();
13044b3e95d5SJeremy L Thompson 
13054b3e95d5SJeremy L Thompson   // Scratch buffers
13069e201c85SYohann   for (CeedInt i = 0; i < num_input_fields; i++) {
13074b3e95d5SJeremy L Thompson     CeedEvalMode eval_mode;
13084b3e95d5SJeremy L Thompson 
13092b730f8bSJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
13109e201c85SYohann     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
13110183ed61SJeremy L Thompson       code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
13127d8d0e25Snbeams     }
13137d8d0e25Snbeams   }
13149e201c85SYohann   for (CeedInt i = 0; i < num_output_fields; i++) {
13150183ed61SJeremy L Thompson     code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
13167d8d0e25Snbeams   }
13177d8d0e25Snbeams 
13180183ed61SJeremy L Thompson   code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
131974398b5aSJeremy L Thompson   if (!is_all_tensor) {
13200183ed61SJeremy L Thompson     code << tab << "const CeedInt Q = " << Q << ";\n";
132174398b5aSJeremy L Thompson   }
132274398b5aSJeremy L Thompson   if (!is_all_nontensor) {
13230183ed61SJeremy L Thompson     code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
132474398b5aSJeremy L Thompson   }
13253a2968d6SJeremy L Thompson   if (is_at_points) {
13260183ed61SJeremy L Thompson     code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n";
13270183ed61SJeremy L Thompson     code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
13283a2968d6SJeremy L Thompson   }
13297d8d0e25Snbeams 
13304b3e95d5SJeremy L Thompson   // Shared data
13310183ed61SJeremy L Thompson   code << tab << "extern __shared__ CeedScalar slice[];\n";
13320183ed61SJeremy L Thompson   code << tab << "SharedData_Hip data;\n";
13330183ed61SJeremy L Thompson   code << tab << "data.t_id_x = threadIdx.x;\n";
13340183ed61SJeremy L Thompson   code << tab << "data.t_id_y = threadIdx.y;\n";
13350183ed61SJeremy L Thompson   code << tab << "data.t_id_z = threadIdx.z;\n";
13360183ed61SJeremy L Thompson   code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
13370183ed61SJeremy L Thompson   code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
13387d8d0e25Snbeams 
13399ee499e5SJeremy L Thompson   // -- Determine input mat reuse
134045a787f7SJeremy L Thompson   FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
13419ee499e5SJeremy L Thompson 
13429ee499e5SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
134345a787f7SJeremy L Thompson     input_matrix_reuse[i].index = -1;
13449ee499e5SJeremy L Thompson   }
13459ee499e5SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
134674398b5aSJeremy L Thompson     bool         is_tensor = true;
13479ee499e5SJeremy L Thompson     CeedEvalMode eval_mode_i;
13489ee499e5SJeremy L Thompson     CeedBasis    basis_i;
13499ee499e5SJeremy L Thompson 
13509ee499e5SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
13519ee499e5SJeremy L Thompson     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
13529ee499e5SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
135374398b5aSJeremy L Thompson     CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
135445a787f7SJeremy L Thompson     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
13559ee499e5SJeremy L Thompson       CeedEvalMode eval_mode_j;
13569ee499e5SJeremy L Thompson       CeedBasis    basis_j;
13579ee499e5SJeremy L Thompson 
13589ee499e5SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
13599ee499e5SJeremy L Thompson       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
13609ee499e5SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
13619ee499e5SJeremy L Thompson       if (basis_i == basis_j) {
13629ee499e5SJeremy L Thompson         if (is_tensor) {
136345a787f7SJeremy L Thompson           input_matrix_reuse[i].index     = j;
136445a787f7SJeremy L Thompson           input_matrix_reuse[i].is_input  = true;
136545a787f7SJeremy L Thompson           input_matrix_reuse[i].eval_mode = eval_mode_j;
13669ee499e5SJeremy L Thompson         } else {
13679ee499e5SJeremy L Thompson           // For non-tensor can only re-use with the same eval mode
13689ee499e5SJeremy L Thompson           if (eval_mode_i == eval_mode_j) {
136945a787f7SJeremy L Thompson             input_matrix_reuse[i].index     = j;
137045a787f7SJeremy L Thompson             input_matrix_reuse[i].is_input  = true;
137145a787f7SJeremy L Thompson             input_matrix_reuse[i].eval_mode = eval_mode_j;
13729ee499e5SJeremy L Thompson           }
13739ee499e5SJeremy L Thompson         }
13749ee499e5SJeremy L Thompson       }
13759ee499e5SJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis_j));
13769ee499e5SJeremy L Thompson     }
13779ee499e5SJeremy L Thompson     CeedCallBackend(CeedBasisDestroy(&basis_i));
13789ee499e5SJeremy L Thompson   }
13799ee499e5SJeremy L Thompson 
13809ee499e5SJeremy L Thompson   // -- Determine output mat reuse
138145a787f7SJeremy L Thompson   FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
13829ee499e5SJeremy L Thompson 
13839ee499e5SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
138445a787f7SJeremy L Thompson     output_matrix_reuse[i].index = -1;
13859ee499e5SJeremy L Thompson   }
13869ee499e5SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
138774398b5aSJeremy L Thompson     bool         is_tensor = true;
13889ee499e5SJeremy L Thompson     CeedEvalMode eval_mode_i;
13899ee499e5SJeremy L Thompson     CeedBasis    basis_i;
13909ee499e5SJeremy L Thompson 
13919ee499e5SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
13929ee499e5SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
139345a787f7SJeremy L Thompson     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
13949ee499e5SJeremy L Thompson       CeedEvalMode eval_mode_j;
13959ee499e5SJeremy L Thompson       CeedBasis    basis_j;
13969ee499e5SJeremy L Thompson 
13979ee499e5SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
13989ee499e5SJeremy L Thompson       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
13999ee499e5SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
14009ee499e5SJeremy L Thompson       if (basis_i == basis_j) {
14019ee499e5SJeremy L Thompson         if (is_tensor) {
140245a787f7SJeremy L Thompson           output_matrix_reuse[i].index     = j;
140345a787f7SJeremy L Thompson           output_matrix_reuse[i].is_input  = true;
140445a787f7SJeremy L Thompson           output_matrix_reuse[i].eval_mode = eval_mode_j;
14059ee499e5SJeremy L Thompson         } else {
14069ee499e5SJeremy L Thompson           // For non-tensor can only re-use with the same eval mode
14079ee499e5SJeremy L Thompson           if (eval_mode_i == eval_mode_j) {
140845a787f7SJeremy L Thompson             output_matrix_reuse[i].index     = j;
140945a787f7SJeremy L Thompson             output_matrix_reuse[i].is_input  = true;
141045a787f7SJeremy L Thompson             output_matrix_reuse[i].eval_mode = eval_mode_j;
14119ee499e5SJeremy L Thompson           }
14129ee499e5SJeremy L Thompson         }
14139ee499e5SJeremy L Thompson       }
14149ee499e5SJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis_j));
14159ee499e5SJeremy L Thompson     }
141645a787f7SJeremy L Thompson     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
14179ee499e5SJeremy L Thompson       CeedEvalMode eval_mode_j;
14189ee499e5SJeremy L Thompson       CeedBasis    basis_j;
14199ee499e5SJeremy L Thompson 
14209ee499e5SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
14219ee499e5SJeremy L Thompson       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
14229ee499e5SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
142374398b5aSJeremy L Thompson       CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
14249ee499e5SJeremy L Thompson       if (basis_i == basis_j) {
14259ee499e5SJeremy L Thompson         if (is_tensor) {
142645a787f7SJeremy L Thompson           output_matrix_reuse[i].index     = j;
142745a787f7SJeremy L Thompson           output_matrix_reuse[i].is_input  = false;
142845a787f7SJeremy L Thompson           output_matrix_reuse[i].eval_mode = eval_mode_j;
14299ee499e5SJeremy L Thompson         } else {
14309ee499e5SJeremy L Thompson           // For non-tensor can only re-use with the same eval mode
14319ee499e5SJeremy L Thompson           if (eval_mode_i == eval_mode_j) {
143245a787f7SJeremy L Thompson             output_matrix_reuse[i].index     = j;
143345a787f7SJeremy L Thompson             output_matrix_reuse[i].is_input  = false;
143445a787f7SJeremy L Thompson             output_matrix_reuse[i].eval_mode = eval_mode_j;
14359ee499e5SJeremy L Thompson           }
14369ee499e5SJeremy L Thompson         }
14379ee499e5SJeremy L Thompson       }
14389ee499e5SJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis_j));
14399ee499e5SJeremy L Thompson     }
14409ee499e5SJeremy L Thompson     CeedCallBackend(CeedBasisDestroy(&basis_i));
14419ee499e5SJeremy L Thompson   }
14429ee499e5SJeremy L Thompson 
14437d8d0e25Snbeams   // Initialize constants, and matrices B and G
14440183ed61SJeremy L Thompson   code << "\n" << tab << "// Input field constants and basis data\n";
14459e201c85SYohann   for (CeedInt i = 0; i < num_input_fields; i++) {
14460183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
14470183ed61SJeremy L Thompson                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
14487d8d0e25Snbeams   }
14490183ed61SJeremy L Thompson   code << "\n" << tab << "// Output field constants and basis data\n";
14509e201c85SYohann   for (CeedInt i = 0; i < num_output_fields; i++) {
14510183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
14520183ed61SJeremy L Thompson                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
14534b3e95d5SJeremy L Thompson   }
14547d8d0e25Snbeams 
14554b3e95d5SJeremy L Thompson   // Loop over all elements
14560183ed61SJeremy L Thompson   code << "\n" << tab << "// Element loop\n";
14570183ed61SJeremy L Thompson   code << tab << "__syncthreads();\n";
14580183ed61SJeremy L Thompson   code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
14590183ed61SJeremy L Thompson   tab.push();
14604b3e95d5SJeremy L Thompson 
1461e93651e5SJeremy L Thompson   // -- Compute minimum buffer space needed
14623a2968d6SJeremy L Thompson   CeedInt max_rstr_buffer_size = 1;
1463e93651e5SJeremy L Thompson 
1464e93651e5SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
14656de40545SJeremy L Thompson     CeedEvalMode eval_mode;
14666de40545SJeremy L Thompson 
14676de40545SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
14686de40545SJeremy L Thompson     if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
1469a61b1c91SJeremy L Thompson       CeedInt             num_comp;
1470e93651e5SJeremy L Thompson       CeedElemRestriction elem_rstr;
1471e93651e5SJeremy L Thompson 
1472e93651e5SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
1473e93651e5SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1474a61b1c91SJeremy L Thompson       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1475681d0ea7SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1476e93651e5SJeremy L Thompson     }
14776de40545SJeremy L Thompson   }
1478e93651e5SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
14796de40545SJeremy L Thompson     CeedEvalMode eval_mode;
14806de40545SJeremy L Thompson 
14816de40545SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
14826de40545SJeremy L Thompson     if (eval_mode != CEED_EVAL_NONE) {
1483a61b1c91SJeremy L Thompson       CeedInt             num_comp;
1484e93651e5SJeremy L Thompson       CeedElemRestriction elem_rstr;
1485e93651e5SJeremy L Thompson 
1486e93651e5SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
1487e93651e5SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1488a61b1c91SJeremy L Thompson       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1489681d0ea7SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1490e93651e5SJeremy L Thompson     }
14916de40545SJeremy L Thompson   }
14920183ed61SJeremy L Thompson   code << tab << "// Scratch restriction buffer space\n";
14930183ed61SJeremy L Thompson   code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
1494e93651e5SJeremy L Thompson 
1495e93651e5SJeremy L Thompson   // -- Determine best input field processing order
1496e93651e5SJeremy L Thompson   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
1497e93651e5SJeremy L Thompson 
1498e93651e5SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
1499e93651e5SJeremy L Thompson     field_rstr_in_buffer[i] = -1;
1500e93651e5SJeremy L Thompson     input_field_order[i]    = -1;
1501e93651e5SJeremy L Thompson   }
1502e93651e5SJeremy L Thompson   {
1503e93651e5SJeremy L Thompson     bool    is_ordered[CEED_FIELD_MAX];
1504e93651e5SJeremy L Thompson     CeedInt curr_index = 0;
1505e93651e5SJeremy L Thompson 
1506e93651e5SJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
1507e93651e5SJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) {
1508e93651e5SJeremy L Thompson       CeedVector          vec_i;
1509e93651e5SJeremy L Thompson       CeedElemRestriction rstr_i;
1510e93651e5SJeremy L Thompson 
1511e93651e5SJeremy L Thompson       if (is_ordered[i]) continue;
1512e93651e5SJeremy L Thompson       field_rstr_in_buffer[i]       = i;
1513e93651e5SJeremy L Thompson       is_ordered[i]                 = true;
1514e93651e5SJeremy L Thompson       input_field_order[curr_index] = i;
1515e93651e5SJeremy L Thompson       curr_index++;
1516034f99fdSJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
1517e93651e5SJeremy L Thompson       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
1518e93651e5SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
1519e93651e5SJeremy L Thompson       for (CeedInt j = i + 1; j < num_input_fields; j++) {
1520e93651e5SJeremy L Thompson         CeedVector          vec_j;
1521e93651e5SJeremy L Thompson         CeedElemRestriction rstr_j;
1522e93651e5SJeremy L Thompson 
1523e93651e5SJeremy L Thompson         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
1524e93651e5SJeremy L Thompson         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
1525e93651e5SJeremy L Thompson         if (rstr_i == rstr_j && vec_i == vec_j) {
1526e93651e5SJeremy L Thompson           field_rstr_in_buffer[j]       = i;
1527e93651e5SJeremy L Thompson           is_ordered[j]                 = true;
1528e93651e5SJeremy L Thompson           input_field_order[curr_index] = j;
1529e93651e5SJeremy L Thompson           curr_index++;
1530e93651e5SJeremy L Thompson         }
15313a2968d6SJeremy L Thompson         CeedCallBackend(CeedVectorDestroy(&vec_j));
15323a2968d6SJeremy L Thompson         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
1533e93651e5SJeremy L Thompson       }
15343a2968d6SJeremy L Thompson       CeedCallBackend(CeedVectorDestroy(&vec_i));
15353a2968d6SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
1536e93651e5SJeremy L Thompson     }
1537e93651e5SJeremy L Thompson   }
1538e93651e5SJeremy L Thompson 
15394b3e95d5SJeremy L Thompson   // -- Input restriction and basis
15400183ed61SJeremy L Thompson   code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
15419e201c85SYohann   for (CeedInt i = 0; i < num_input_fields; i++) {
154259fa3f92SJeremy L Thompson     const char   *field_name;
154359fa3f92SJeremy L Thompson     const CeedInt f = input_field_order[i];
1544e93651e5SJeremy L Thompson 
154559fa3f92SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
15460183ed61SJeremy L Thompson     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
15477d8d0e25Snbeams 
15484b3e95d5SJeremy L Thompson     // ---- Restriction
15490183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
15500183ed61SJeremy L Thompson                                                                max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1551b7453713SJeremy L Thompson 
15524b3e95d5SJeremy L Thompson     // ---- Basis action
15530183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
15540183ed61SJeremy L Thompson                                                          is_all_tensor, is_at_points, use_3d_slices));
15557d8d0e25Snbeams   }
15567d8d0e25Snbeams 
15574b3e95d5SJeremy L Thompson   // -- Q function
15580183ed61SJeremy L Thompson   CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
15590183ed61SJeremy L Thompson                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
15600183ed61SJeremy L Thompson                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices));
15617d8d0e25Snbeams 
15624b3e95d5SJeremy L Thompson   // -- Output basis and restriction
15630183ed61SJeremy L Thompson   code << "\n" << tab << "// -- Output field basis action and restrictions\n";
15649e201c85SYohann   for (CeedInt i = 0; i < num_output_fields; i++) {
156559fa3f92SJeremy L Thompson     const char *field_name;
156659fa3f92SJeremy L Thompson 
156759fa3f92SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
15680183ed61SJeremy L Thompson     code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
1569b7453713SJeremy L Thompson 
15704b3e95d5SJeremy L Thompson     // ---- Basis action
15710183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
15720183ed61SJeremy L Thompson                                                          is_all_tensor, is_at_points, use_3d_slices));
15737d8d0e25Snbeams 
15744b3e95d5SJeremy L Thompson     // ---- Restriction
15750183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, i, NULL, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d,
15760183ed61SJeremy L Thompson                                                                false, is_all_tensor, is_at_points, use_3d_slices));
15777d8d0e25Snbeams   }
15787d8d0e25Snbeams 
15794b3e95d5SJeremy L Thompson   // Close loop and function
15800183ed61SJeremy L Thompson   tab.pop();
15810183ed61SJeremy L Thompson   code << tab << "}\n";
15820183ed61SJeremy L Thompson   tab.pop();
15830183ed61SJeremy L Thompson   code << tab << "}\n";
15840183ed61SJeremy L Thompson   code << tab << "// -----------------------------------------------------------------------------\n\n";
15857d8d0e25Snbeams 
1586539ec17dSJeremy L Thompson   CeedInt block_sizes[3] = {0, 0, 0};
15879e201c85SYohann   CeedInt num_elem;
1588b7453713SJeremy L Thompson 
15893a2968d6SJeremy L Thompson   // Compile
15902b730f8bSJeremy L Thompson   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
159174398b5aSJeremy L Thompson   CeedCallBackend(BlockGridCalculate_Hip_gen(is_all_tensor ? max_dim : 1, num_elem, data->max_P_1d, is_all_tensor ? Q_1d : Q, block_sizes));
159290c30374SJeremy L Thompson   if (is_at_points) block_sizes[2] = 1;
15938d12f40eSJeremy L Thompson   {
15948d12f40eSJeremy L Thompson     bool is_compile_good = false;
15958d12f40eSJeremy L Thompson 
1596a61b1c91SJeremy L Thompson     data->thread_1d = block_sizes[0];
15976b92dc4bSJeremy L Thompson     CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "OP_T_1D", block_sizes[0], "BLOCK_SIZE",
15982b730f8bSJeremy L Thompson                                        block_sizes[0] * block_sizes[1] * block_sizes[2]));
15998d12f40eSJeremy L Thompson     if (is_compile_good) {
16008d12f40eSJeremy L Thompson       *is_good_build = true;
1601eb7e6cafSJeremy L Thompson       CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, operator_name.c_str(), &data->op));
16028d12f40eSJeremy L Thompson     } else {
16038d12f40eSJeremy L Thompson       *is_good_build     = false;
16048d12f40eSJeremy L Thompson       data->use_fallback = true;
16058d12f40eSJeremy L Thompson     }
16068d12f40eSJeremy L Thompson   }
16072b730f8bSJeremy L Thompson   CeedCallBackend(CeedOperatorSetSetupDone(op));
16089bc66399SJeremy L Thompson   CeedCallBackend(CeedDestroy(&ceed));
1609c11e12f4SJeremy L Thompson   CeedCallBackend(CeedQFunctionDestroy(&qf));
1610e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
16117d8d0e25Snbeams }
16122a86cc9dSSebastian Grimberg 
16137d8d0e25Snbeams //------------------------------------------------------------------------------
16140183ed61SJeremy L Thompson // Build AtPoints assembly operator kernel
16150183ed61SJeremy L Thompson //------------------------------------------------------------------------------
16160183ed61SJeremy L Thompson static int CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op, bool is_full, bool *is_good_build) {
16170183ed61SJeremy L Thompson   bool                   is_all_tensor = true, is_at_points = false, use_3d_slices = false;
16180183ed61SJeremy L Thompson   Ceed                   ceed;
16190183ed61SJeremy L Thompson   CeedInt                Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
16200183ed61SJeremy L Thompson   CeedQFunctionField    *qf_input_fields, *qf_output_fields;
16210183ed61SJeremy L Thompson   CeedQFunction_Hip_gen *qf_data;
16220183ed61SJeremy L Thompson   CeedQFunction          qf;
16230183ed61SJeremy L Thompson   CeedOperatorField     *op_input_fields, *op_output_fields;
16240183ed61SJeremy L Thompson   CeedOperator_Hip_gen  *data;
16250183ed61SJeremy L Thompson   std::ostringstream     code;
16260183ed61SJeremy L Thompson   Tab                    tab;
16270183ed61SJeremy L Thompson 
16280183ed61SJeremy L Thompson   // Check compatibility
16290183ed61SJeremy L Thompson   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
16300183ed61SJeremy L Thompson   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
16310183ed61SJeremy L Thompson   CeedCheck(is_at_points, ceed, CEED_ERROR_BACKEND, "Only AtPoints operator assembly supported");
16320183ed61SJeremy L Thompson 
16330183ed61SJeremy L Thompson   // Retrieve operator data
16340183ed61SJeremy L Thompson   CeedCallBackend(CeedOperatorGetData(op, &data));
16350183ed61SJeremy L Thompson   Q       = data->Q;
16360183ed61SJeremy L Thompson   Q_1d    = data->Q_1d;
16370183ed61SJeremy L Thompson   max_dim = data->dim;
16380183ed61SJeremy L Thompson   {
16390183ed61SJeremy L Thompson     CeedElemRestriction rstr_points = NULL;
16400183ed61SJeremy L Thompson 
16410183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
16420183ed61SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
16430183ed61SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
16440183ed61SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
16450183ed61SJeremy L Thompson   }
16460183ed61SJeremy L Thompson   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
16470183ed61SJeremy L Thompson   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
16480183ed61SJeremy L Thompson   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
16490183ed61SJeremy L Thompson   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
16500183ed61SJeremy L Thompson 
16510183ed61SJeremy L Thompson   // Load basis source files
16520183ed61SJeremy L Thompson   code << tab << "// Tensor basis source\n";
16530183ed61SJeremy L Thompson   code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
16540183ed61SJeremy L Thompson   code << tab << "// AtPoints basis source\n";
16550183ed61SJeremy L Thompson   code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n";
16560183ed61SJeremy L Thompson   code << tab << "// CodeGen operator source\n";
16570183ed61SJeremy L Thompson   code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
16580183ed61SJeremy L Thompson 
16590183ed61SJeremy L Thompson   // Get QFunction name
16600183ed61SJeremy L Thompson   std::string qfunction_name(qf_data->qfunction_name);
16610183ed61SJeremy L Thompson   std::string operator_name;
16620183ed61SJeremy L Thompson 
16630183ed61SJeremy L Thompson   if (is_full) {
16640183ed61SJeremy L Thompson     operator_name = "CeedKernelHipGenOperatorFullAssembly_" + qfunction_name;
16650183ed61SJeremy L Thompson   } else {
16660183ed61SJeremy L Thompson     operator_name = "CeedKernelHipGenOperatorDiagonalAssembly_" + qfunction_name;
16670183ed61SJeremy L Thompson   }
16680183ed61SJeremy L Thompson 
16690183ed61SJeremy L Thompson   // Define CEED_Q_VLA
16700183ed61SJeremy L Thompson   code << "\n" << tab << "#undef CEED_Q_VLA\n";
16710183ed61SJeremy L Thompson   code << tab << "#define CEED_Q_VLA 1\n\n";
16720183ed61SJeremy L Thompson 
16730183ed61SJeremy L Thompson   // Add user QFunction source
16740183ed61SJeremy L Thompson   {
16750183ed61SJeremy L Thompson     const char *source_path;
16760183ed61SJeremy L Thompson 
16770183ed61SJeremy L Thompson     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
16780183ed61SJeremy L Thompson     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
16790183ed61SJeremy L Thompson 
16800183ed61SJeremy L Thompson     code << tab << "// User QFunction source\n";
16810183ed61SJeremy L Thompson     code << tab << "#include \"" << source_path << "\"\n\n";
16820183ed61SJeremy L Thompson   }
16830183ed61SJeremy L Thompson 
16840183ed61SJeremy L Thompson   // Setup
16850183ed61SJeremy L Thompson   code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
16860183ed61SJeremy L Thompson   code << tab << "// Operator Assembly Kernel\n";
16870183ed61SJeremy L Thompson   code << tab << "// \n";
16880183ed61SJeremy L Thompson   code << tab << "// d_[in,out]_i:   CeedVector device array\n";
16890183ed61SJeremy L Thompson   code << tab << "// r_[in,out]_e_i: Element vector register\n";
16900183ed61SJeremy L Thompson   code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
16910183ed61SJeremy L Thompson   code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
16920183ed61SJeremy L Thompson   code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
16930183ed61SJeremy L Thompson   code << tab << "// \n";
16940183ed61SJeremy L Thompson   code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
16950183ed61SJeremy L Thompson   code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
16960183ed61SJeremy L Thompson   code << tab << "// -----------------------------------------------------------------------------\n";
16970183ed61SJeremy L Thompson   code << tab << "extern \"C\" __global__ void " << operator_name
16980183ed61SJeremy L Thompson        << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar *W, Points_Hip "
16990183ed61SJeremy L Thompson           "points, CeedScalar *__restrict__ values_array) {\n";
17000183ed61SJeremy L Thompson   tab.push();
17010183ed61SJeremy L Thompson 
17020183ed61SJeremy L Thompson   // Scratch buffers
17030183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
17040183ed61SJeremy L Thompson     CeedEvalMode eval_mode;
17050183ed61SJeremy L Thompson 
17060183ed61SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
17070183ed61SJeremy L Thompson     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
17080183ed61SJeremy L Thompson       code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
17090183ed61SJeremy L Thompson     }
17100183ed61SJeremy L Thompson   }
17110183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
17120183ed61SJeremy L Thompson     code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
17130183ed61SJeremy L Thompson   }
17140183ed61SJeremy L Thompson 
17150183ed61SJeremy L Thompson   code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
17160183ed61SJeremy L Thompson   code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
17170183ed61SJeremy L Thompson   code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n";
17180183ed61SJeremy L Thompson   code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
17190183ed61SJeremy L Thompson 
17200183ed61SJeremy L Thompson   // Shared data
17210183ed61SJeremy L Thompson   code << tab << "extern __shared__ CeedScalar slice[];\n";
17220183ed61SJeremy L Thompson   code << tab << "SharedData_Hip data;\n";
17230183ed61SJeremy L Thompson   code << tab << "data.t_id_x = threadIdx.x;\n";
17240183ed61SJeremy L Thompson   code << tab << "data.t_id_y = threadIdx.y;\n";
17250183ed61SJeremy L Thompson   code << tab << "data.t_id_z = threadIdx.z;\n";
17260183ed61SJeremy L Thompson   code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
17270183ed61SJeremy L Thompson   code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
17280183ed61SJeremy L Thompson 
17290183ed61SJeremy L Thompson   // -- Determine input mat reuse
17300183ed61SJeremy L Thompson   FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
17310183ed61SJeremy L Thompson 
17320183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
17330183ed61SJeremy L Thompson     input_matrix_reuse[i].index = -1;
17340183ed61SJeremy L Thompson   }
17350183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
17360183ed61SJeremy L Thompson     CeedEvalMode eval_mode_i;
17370183ed61SJeremy L Thompson     CeedBasis    basis_i;
17380183ed61SJeremy L Thompson 
17390183ed61SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
17400183ed61SJeremy L Thompson     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
17410183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
17420183ed61SJeremy L Thompson     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
17430183ed61SJeremy L Thompson       CeedEvalMode eval_mode_j;
17440183ed61SJeremy L Thompson       CeedBasis    basis_j;
17450183ed61SJeremy L Thompson 
17460183ed61SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
17470183ed61SJeremy L Thompson       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
17480183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
17490183ed61SJeremy L Thompson       if (basis_i == basis_j) {
17500183ed61SJeremy L Thompson         input_matrix_reuse[i].index     = j;
17510183ed61SJeremy L Thompson         input_matrix_reuse[i].is_input  = true;
17520183ed61SJeremy L Thompson         input_matrix_reuse[i].eval_mode = eval_mode_j;
17530183ed61SJeremy L Thompson       }
17540183ed61SJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis_j));
17550183ed61SJeremy L Thompson     }
17560183ed61SJeremy L Thompson     CeedCallBackend(CeedBasisDestroy(&basis_i));
17570183ed61SJeremy L Thompson   }
17580183ed61SJeremy L Thompson 
17590183ed61SJeremy L Thompson   // -- Determine output mat reuse
17600183ed61SJeremy L Thompson   FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
17610183ed61SJeremy L Thompson 
17620183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
17630183ed61SJeremy L Thompson     output_matrix_reuse[i].index = -1;
17640183ed61SJeremy L Thompson   }
17650183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
17660183ed61SJeremy L Thompson     CeedEvalMode eval_mode_i;
17670183ed61SJeremy L Thompson     CeedBasis    basis_i;
17680183ed61SJeremy L Thompson 
17690183ed61SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
17700183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
17710183ed61SJeremy L Thompson     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
17720183ed61SJeremy L Thompson       CeedEvalMode eval_mode_j;
17730183ed61SJeremy L Thompson       CeedBasis    basis_j;
17740183ed61SJeremy L Thompson 
17750183ed61SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
17760183ed61SJeremy L Thompson       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
17770183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
17780183ed61SJeremy L Thompson       if (basis_i == basis_j) {
17790183ed61SJeremy L Thompson         output_matrix_reuse[i].index     = j;
17800183ed61SJeremy L Thompson         output_matrix_reuse[i].is_input  = true;
17810183ed61SJeremy L Thompson         output_matrix_reuse[i].eval_mode = eval_mode_j;
17820183ed61SJeremy L Thompson       }
17830183ed61SJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis_j));
17840183ed61SJeremy L Thompson     }
17850183ed61SJeremy L Thompson     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
17860183ed61SJeremy L Thompson       CeedEvalMode eval_mode_j;
17870183ed61SJeremy L Thompson       CeedBasis    basis_j;
17880183ed61SJeremy L Thompson 
17890183ed61SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
17900183ed61SJeremy L Thompson       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
17910183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
17920183ed61SJeremy L Thompson       if (basis_i == basis_j) {
17930183ed61SJeremy L Thompson         output_matrix_reuse[i].index     = j;
17940183ed61SJeremy L Thompson         output_matrix_reuse[i].is_input  = false;
17950183ed61SJeremy L Thompson         output_matrix_reuse[i].eval_mode = eval_mode_j;
17960183ed61SJeremy L Thompson       }
17970183ed61SJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis_j));
17980183ed61SJeremy L Thompson     }
17990183ed61SJeremy L Thompson     CeedCallBackend(CeedBasisDestroy(&basis_i));
18000183ed61SJeremy L Thompson   }
18010183ed61SJeremy L Thompson 
18020183ed61SJeremy L Thompson   // Initialize constants, and matrices B and G
18030183ed61SJeremy L Thompson   code << "\n" << tab << "// Input field constants and basis data\n";
18040183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
18050183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
18060183ed61SJeremy L Thompson                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
18070183ed61SJeremy L Thompson   }
18080183ed61SJeremy L Thompson   code << "\n" << tab << "// Output field constants and basis data\n";
18090183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
18100183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
18110183ed61SJeremy L Thompson                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
18120183ed61SJeremy L Thompson   }
18130183ed61SJeremy L Thompson 
18140183ed61SJeremy L Thompson   // Loop over all elements
18150183ed61SJeremy L Thompson   code << "\n" << tab << "// Element loop\n";
18160183ed61SJeremy L Thompson   code << tab << "__syncthreads();\n";
18170183ed61SJeremy L Thompson   code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
18180183ed61SJeremy L Thompson   tab.push();
18190183ed61SJeremy L Thompson 
18200183ed61SJeremy L Thompson   // -- Compute minimum buffer space needed
18210183ed61SJeremy L Thompson   CeedInt max_rstr_buffer_size = 1;
18220183ed61SJeremy L Thompson 
18230183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
18240183ed61SJeremy L Thompson     CeedEvalMode eval_mode;
18250183ed61SJeremy L Thompson 
18260183ed61SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
18270183ed61SJeremy L Thompson     if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
18280183ed61SJeremy L Thompson       CeedInt             num_comp;
18290183ed61SJeremy L Thompson       CeedElemRestriction elem_rstr;
18300183ed61SJeremy L Thompson 
18310183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
18320183ed61SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
18330183ed61SJeremy L Thompson       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
18340183ed61SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
18350183ed61SJeremy L Thompson     }
18360183ed61SJeremy L Thompson   }
18370183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
18380183ed61SJeremy L Thompson     CeedEvalMode eval_mode;
18390183ed61SJeremy L Thompson 
18400183ed61SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
18410183ed61SJeremy L Thompson     if (eval_mode != CEED_EVAL_NONE) {
18420183ed61SJeremy L Thompson       CeedInt             num_comp;
18430183ed61SJeremy L Thompson       CeedElemRestriction elem_rstr;
18440183ed61SJeremy L Thompson 
18450183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
18460183ed61SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
18470183ed61SJeremy L Thompson       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
18480183ed61SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
18490183ed61SJeremy L Thompson     }
18500183ed61SJeremy L Thompson   }
18510183ed61SJeremy L Thompson   code << tab << "// Scratch restriction buffer space\n";
18520183ed61SJeremy L Thompson   code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
18530183ed61SJeremy L Thompson 
18540183ed61SJeremy L Thompson   // -- Determine best input field processing order
18550183ed61SJeremy L Thompson   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
18560183ed61SJeremy L Thompson 
18570183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
18580183ed61SJeremy L Thompson     field_rstr_in_buffer[i] = -1;
18590183ed61SJeremy L Thompson     input_field_order[i]    = -1;
18600183ed61SJeremy L Thompson   }
18610183ed61SJeremy L Thompson   {
18620183ed61SJeremy L Thompson     bool    is_ordered[CEED_FIELD_MAX];
18630183ed61SJeremy L Thompson     CeedInt curr_index = 0;
18640183ed61SJeremy L Thompson 
18650183ed61SJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
18660183ed61SJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) {
18670183ed61SJeremy L Thompson       CeedVector          vec_i;
18680183ed61SJeremy L Thompson       CeedElemRestriction rstr_i;
18690183ed61SJeremy L Thompson 
18700183ed61SJeremy L Thompson       if (is_ordered[i]) continue;
18710183ed61SJeremy L Thompson       field_rstr_in_buffer[i]       = i;
18720183ed61SJeremy L Thompson       is_ordered[i]                 = true;
18730183ed61SJeremy L Thompson       input_field_order[curr_index] = i;
18740183ed61SJeremy L Thompson       curr_index++;
18750183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
18760183ed61SJeremy L Thompson       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
18770183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
18780183ed61SJeremy L Thompson       for (CeedInt j = i + 1; j < num_input_fields; j++) {
18790183ed61SJeremy L Thompson         CeedVector          vec_j;
18800183ed61SJeremy L Thompson         CeedElemRestriction rstr_j;
18810183ed61SJeremy L Thompson 
18820183ed61SJeremy L Thompson         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
18830183ed61SJeremy L Thompson         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
18840183ed61SJeremy L Thompson         if (rstr_i == rstr_j && vec_i == vec_j) {
18850183ed61SJeremy L Thompson           field_rstr_in_buffer[j]       = i;
18860183ed61SJeremy L Thompson           is_ordered[j]                 = true;
18870183ed61SJeremy L Thompson           input_field_order[curr_index] = j;
18880183ed61SJeremy L Thompson           curr_index++;
18890183ed61SJeremy L Thompson         }
18900183ed61SJeremy L Thompson         CeedCallBackend(CeedVectorDestroy(&vec_j));
18910183ed61SJeremy L Thompson         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
18920183ed61SJeremy L Thompson       }
18930183ed61SJeremy L Thompson       CeedCallBackend(CeedVectorDestroy(&vec_i));
18940183ed61SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
18950183ed61SJeremy L Thompson     }
18960183ed61SJeremy L Thompson   }
18970183ed61SJeremy L Thompson 
18980183ed61SJeremy L Thompson   // -- Input restriction and basis
18990183ed61SJeremy L Thompson   code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
19000183ed61SJeremy L Thompson   CeedInt active_field_index = -1;
19010183ed61SJeremy L Thompson 
19020183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
19030183ed61SJeremy L Thompson     bool          is_active = false;
19040183ed61SJeremy L Thompson     const char   *field_name;
19050183ed61SJeremy L Thompson     const CeedInt f = input_field_order[i];
19060183ed61SJeremy L Thompson 
19070183ed61SJeremy L Thompson     {
19080183ed61SJeremy L Thompson       CeedVector vec;
19090183ed61SJeremy L Thompson 
19100183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
19110183ed61SJeremy L Thompson       is_active = vec == CEED_VECTOR_ACTIVE;
19120183ed61SJeremy L Thompson       CeedCallBackend(CeedVectorDestroy(&vec));
19130183ed61SJeremy L Thompson     }
19140183ed61SJeremy L Thompson 
19150183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
19160183ed61SJeremy L Thompson     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
19170183ed61SJeremy L Thompson 
19180183ed61SJeremy L Thompson     if (is_active) {
19190183ed61SJeremy L Thompson       std::string var_suffix = "_in_" + std::to_string(f);
19200183ed61SJeremy L Thompson 
19210183ed61SJeremy L Thompson       code << tab << "// Active field - no restriction or basis action here\n";
19220183ed61SJeremy L Thompson       if (active_field_index == -1) {
19230183ed61SJeremy L Thompson         active_field_index = f;
19240183ed61SJeremy L Thompson         code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? "P_1d" + var_suffix : "1")
19250183ed61SJeremy L Thompson              << "] = {0.0};\n";
19260183ed61SJeremy L Thompson       } else {
19270183ed61SJeremy L Thompson         code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_in_" << active_field_index << ";\n";
19280183ed61SJeremy L Thompson       }
19290183ed61SJeremy L Thompson     } else {
19300183ed61SJeremy L Thompson       // ---- Restriction
19310183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
19320183ed61SJeremy L Thompson                                                                  max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
19330183ed61SJeremy L Thompson 
19340183ed61SJeremy L Thompson       // ---- Basis action
19350183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
19360183ed61SJeremy L Thompson                                                            is_all_tensor, is_at_points, use_3d_slices));
19370183ed61SJeremy L Thompson     }
19380183ed61SJeremy L Thompson   }
19390183ed61SJeremy L Thompson 
19400183ed61SJeremy L Thompson   // -- Loop over active field
19410183ed61SJeremy L Thompson   std::string active_var_suffix = "_in_" + std::to_string(active_field_index);
19420183ed61SJeremy L Thompson 
19430183ed61SJeremy L Thompson   code << "\n" << tab << "// Loop over nodes in active field\n";
19440183ed61SJeremy L Thompson   code << tab << "for (CeedInt n = 0; n < num_comp" << active_var_suffix << "*P_1d" << active_var_suffix
19450183ed61SJeremy L Thompson        << (max_dim > 1 ? "*P_1d" + active_var_suffix : "") << (max_dim > 2 ? "*P_1d" + active_var_suffix : "") << "; n++) {\n";
19460183ed61SJeremy L Thompson   tab.push();
19470183ed61SJeremy L Thompson 
19480183ed61SJeremy L Thompson   // -- Set current active node and component to 1
19490183ed61SJeremy L Thompson   code << tab << "// Set current active node and component to 1.0\n";
19500183ed61SJeremy L Thompson   code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 1.0, r_e"
19510183ed61SJeremy L Thompson        << active_var_suffix << ");\n\n";
19520183ed61SJeremy L Thompson 
19530183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
19540183ed61SJeremy L Thompson     bool          is_active = false;
19550183ed61SJeremy L Thompson     const char   *field_name;
19560183ed61SJeremy L Thompson     const CeedInt f = input_field_order[i];
19570183ed61SJeremy L Thompson 
19580183ed61SJeremy L Thompson     {
19590183ed61SJeremy L Thompson       CeedVector vec;
19600183ed61SJeremy L Thompson 
19610183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
19620183ed61SJeremy L Thompson       is_active = vec == CEED_VECTOR_ACTIVE;
19630183ed61SJeremy L Thompson       CeedCallBackend(CeedVectorDestroy(&vec));
19640183ed61SJeremy L Thompson     }
19650183ed61SJeremy L Thompson     if (!is_active) continue;
19660183ed61SJeremy L Thompson 
19670183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
19680183ed61SJeremy L Thompson     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
19690183ed61SJeremy L Thompson 
19700183ed61SJeremy L Thompson     // ---- Basis action
19710183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
19720183ed61SJeremy L Thompson                                                          is_all_tensor, is_at_points, use_3d_slices));
19730183ed61SJeremy L Thompson   }
19740183ed61SJeremy L Thompson 
19750183ed61SJeremy L Thompson   // -- Q function
19760183ed61SJeremy L Thompson   CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
19770183ed61SJeremy L Thompson                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
19780183ed61SJeremy L Thompson                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices));
19790183ed61SJeremy L Thompson 
19800183ed61SJeremy L Thompson   // -- Output basis and restriction
19810183ed61SJeremy L Thompson   code << "\n" << tab << "// -- Output field basis action and restrictions\n";
19820183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
19830183ed61SJeremy L Thompson     bool        is_active = false;
19840183ed61SJeremy L Thompson     const char *field_name;
19850183ed61SJeremy L Thompson 
19860183ed61SJeremy L Thompson     {
19870183ed61SJeremy L Thompson       CeedVector vec;
19880183ed61SJeremy L Thompson 
19890183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
19900183ed61SJeremy L Thompson       is_active = vec == CEED_VECTOR_ACTIVE;
19910183ed61SJeremy L Thompson       CeedCallBackend(CeedVectorDestroy(&vec));
19920183ed61SJeremy L Thompson     }
19930183ed61SJeremy L Thompson     if (!is_active) continue;
19940183ed61SJeremy L Thompson 
19950183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
19960183ed61SJeremy L Thompson     code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
19970183ed61SJeremy L Thompson 
19980183ed61SJeremy L Thompson     // ---- Basis action
19990183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
20000183ed61SJeremy L Thompson                                                          is_all_tensor, is_at_points, use_3d_slices));
20010183ed61SJeremy L Thompson 
20020183ed61SJeremy L Thompson     // ---- Restriction
20030183ed61SJeremy L Thompson     if (is_full) {
2004692716b7SZach Atkins       std::string         var_suffix = "_out_" + std::to_string(i);
2005692716b7SZach Atkins       CeedInt             comp_stride;
2006692716b7SZach Atkins       CeedSize            l_size;
2007692716b7SZach Atkins       CeedElemRestriction elem_rstr;
2008692716b7SZach Atkins 
2009692716b7SZach Atkins       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
2010692716b7SZach Atkins       CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
2011692716b7SZach Atkins       code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
2012692716b7SZach Atkins       CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
2013692716b7SZach Atkins       code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
2014692716b7SZach Atkins       code << tab << "WriteLVecStandard" << max_dim << "d_Assembly<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
2015692716b7SZach Atkins            << ">(data, l_size" << var_suffix << ", elem, n, r_e" << var_suffix << ", values_array);\n";
2016692716b7SZach Atkins       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
20170183ed61SJeremy L Thompson     } else {
20180183ed61SJeremy L Thompson       std::string         var_suffix = "_out_" + std::to_string(i);
20190183ed61SJeremy L Thompson       CeedInt             comp_stride;
20200183ed61SJeremy L Thompson       CeedSize            l_size;
20210183ed61SJeremy L Thompson       CeedElemRestriction elem_rstr;
20220183ed61SJeremy L Thompson 
20230183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
20240183ed61SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
20250183ed61SJeremy L Thompson       code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
20260183ed61SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
20270183ed61SJeremy L Thompson       code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
20280183ed61SJeremy L Thompson       code << tab << "WriteLVecStandard" << max_dim << "d_Single<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
20290183ed61SJeremy L Thompson            << ">(data, l_size" << var_suffix << ", elem, n, indices.outputs[" << i << "], r_e" << var_suffix << ", values_array);\n";
20300183ed61SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
20310183ed61SJeremy L Thompson     }
20320183ed61SJeremy L Thompson   }
20330183ed61SJeremy L Thompson 
20340183ed61SJeremy L Thompson   // -- Reset current active node and component
20350183ed61SJeremy L Thompson   code << "\n" << tab << "// Reset current active node and component to 0.0\n";
20360183ed61SJeremy L Thompson   code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 0.0, r_e"
20370183ed61SJeremy L Thompson        << active_var_suffix << ");\n";
20380183ed61SJeremy L Thompson 
20390183ed61SJeremy L Thompson   // -- End of loop over active field
20400183ed61SJeremy L Thompson   tab.pop();
20410183ed61SJeremy L Thompson   code << tab << "}\n";
20420183ed61SJeremy L Thompson 
20430183ed61SJeremy L Thompson   // Close loop and function
20440183ed61SJeremy L Thompson   tab.pop();
20450183ed61SJeremy L Thompson   code << tab << "}\n";
20460183ed61SJeremy L Thompson   tab.pop();
20470183ed61SJeremy L Thompson   code << tab << "}\n";
20480183ed61SJeremy L Thompson   code << tab << "// -----------------------------------------------------------------------------\n\n";
20490183ed61SJeremy L Thompson 
20500183ed61SJeremy L Thompson   CeedInt block_sizes[3] = {0, 0, 0};
20510183ed61SJeremy L Thompson   CeedInt num_elem;
20520183ed61SJeremy L Thompson 
20530183ed61SJeremy L Thompson   // Compile
20540183ed61SJeremy L Thompson   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
20550183ed61SJeremy L Thompson   CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes));
20560183ed61SJeremy L Thompson   block_sizes[2] = 1;
20570183ed61SJeremy L Thompson   {
20580183ed61SJeremy L Thompson     bool is_compile_good = false;
20590183ed61SJeremy L Thompson 
20600183ed61SJeremy L Thompson     data->thread_1d = block_sizes[0];
20610183ed61SJeremy L Thompson     CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good,
20620183ed61SJeremy L Thompson                                        is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 2, "OP_T_1D", block_sizes[0],
20630183ed61SJeremy L Thompson                                        "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]));
20640183ed61SJeremy L Thompson     if (is_compile_good) {
20650183ed61SJeremy L Thompson       *is_good_build = true;
20660183ed61SJeremy L Thompson       CeedCallBackend(CeedGetKernel_Hip(ceed, is_full ? data->module_assemble_full : data->module_assemble_diagonal, operator_name.c_str(),
20670183ed61SJeremy L Thompson                                         is_full ? &data->assemble_full : &data->assemble_diagonal));
20680183ed61SJeremy L Thompson     } else {
20690183ed61SJeremy L Thompson       *is_good_build              = false;
20700183ed61SJeremy L Thompson       data->use_assembly_fallback = true;
20710183ed61SJeremy L Thompson     }
20720183ed61SJeremy L Thompson   }
20730183ed61SJeremy L Thompson   CeedCallBackend(CeedDestroy(&ceed));
20740183ed61SJeremy L Thompson   CeedCallBackend(CeedQFunctionDestroy(&qf));
20750183ed61SJeremy L Thompson   return CEED_ERROR_SUCCESS;
20760183ed61SJeremy L Thompson }
20770183ed61SJeremy L Thompson 
20780183ed61SJeremy L Thompson extern "C" int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) {
20790183ed61SJeremy L Thompson   return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, false, is_good_build);
20800183ed61SJeremy L Thompson }
20810183ed61SJeremy L Thompson 
2082692716b7SZach Atkins extern "C" int CeedOperatorBuildKernelFullAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) {
2083692716b7SZach Atkins   return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, true, is_good_build);
2084692716b7SZach Atkins }
2085*5daefc96SJeremy L Thompson //------------------------------------------------------------------------------
2086*5daefc96SJeremy L Thompson // Build QFunction assembly operator kernel
2087*5daefc96SJeremy L Thompson //------------------------------------------------------------------------------
2088*5daefc96SJeremy L Thompson extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperator op, bool *is_good_build) {
2089*5daefc96SJeremy L Thompson   bool                   is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
2090*5daefc96SJeremy L Thompson   Ceed                   ceed;
2091*5daefc96SJeremy L Thompson   CeedInt                Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0;
2092*5daefc96SJeremy L Thompson   CeedQFunctionField    *qf_input_fields, *qf_output_fields;
2093*5daefc96SJeremy L Thompson   CeedQFunction_Hip_gen *qf_data;
2094*5daefc96SJeremy L Thompson   CeedQFunction          qf;
2095*5daefc96SJeremy L Thompson   CeedOperatorField     *op_input_fields, *op_output_fields;
2096*5daefc96SJeremy L Thompson   CeedOperator_Hip_gen  *data;
2097*5daefc96SJeremy L Thompson   std::ostringstream     code;
2098*5daefc96SJeremy L Thompson   Tab                    tab;
2099*5daefc96SJeremy L Thompson 
2100*5daefc96SJeremy L Thompson   // Check compatibility
2101*5daefc96SJeremy L Thompson   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
2102*5daefc96SJeremy L Thompson   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
2103*5daefc96SJeremy L Thompson   CeedCheck(!is_at_points, ceed, CEED_ERROR_BACKEND, "AtPoints QFunction assembly is not supported");
2104*5daefc96SJeremy L Thompson 
2105*5daefc96SJeremy L Thompson   // Check field compatibility
2106*5daefc96SJeremy L Thompson   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
2107*5daefc96SJeremy L Thompson   {
2108*5daefc96SJeremy L Thompson     bool has_shared_bases = true;
2109*5daefc96SJeremy L Thompson 
2110*5daefc96SJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) {
2111*5daefc96SJeremy L Thompson       CeedBasis basis;
2112*5daefc96SJeremy L Thompson 
2113*5daefc96SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
2114*5daefc96SJeremy L Thompson       if (basis != CEED_BASIS_NONE) {
2115*5daefc96SJeremy L Thompson         bool        is_tensor = true;
2116*5daefc96SJeremy L Thompson         const char *resource;
2117*5daefc96SJeremy L Thompson         char       *resource_root;
2118*5daefc96SJeremy L Thompson         Ceed        basis_ceed;
2119*5daefc96SJeremy L Thompson 
2120*5daefc96SJeremy L Thompson         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
2121*5daefc96SJeremy L Thompson         is_all_tensor    = is_all_tensor && is_tensor;
2122*5daefc96SJeremy L Thompson         is_all_nontensor = is_all_nontensor && !is_tensor;
2123*5daefc96SJeremy L Thompson         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
2124*5daefc96SJeremy L Thompson         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
2125*5daefc96SJeremy L Thompson         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
2126*5daefc96SJeremy L Thompson         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
2127*5daefc96SJeremy L Thompson         CeedCallBackend(CeedFree(&resource_root));
2128*5daefc96SJeremy L Thompson         CeedCallBackend(CeedDestroy(&basis_ceed));
2129*5daefc96SJeremy L Thompson       }
2130*5daefc96SJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis));
2131*5daefc96SJeremy L Thompson     }
2132*5daefc96SJeremy L Thompson 
2133*5daefc96SJeremy L Thompson     for (CeedInt i = 0; i < num_output_fields; i++) {
2134*5daefc96SJeremy L Thompson       CeedBasis basis;
2135*5daefc96SJeremy L Thompson 
2136*5daefc96SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
2137*5daefc96SJeremy L Thompson       if (basis != CEED_BASIS_NONE) {
2138*5daefc96SJeremy L Thompson         bool        is_tensor = true;
2139*5daefc96SJeremy L Thompson         const char *resource;
2140*5daefc96SJeremy L Thompson         char       *resource_root;
2141*5daefc96SJeremy L Thompson         Ceed        basis_ceed;
2142*5daefc96SJeremy L Thompson 
2143*5daefc96SJeremy L Thompson         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
2144*5daefc96SJeremy L Thompson         is_all_tensor    = is_all_tensor && is_tensor;
2145*5daefc96SJeremy L Thompson         is_all_nontensor = is_all_nontensor && !is_tensor;
2146*5daefc96SJeremy L Thompson 
2147*5daefc96SJeremy L Thompson         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
2148*5daefc96SJeremy L Thompson         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
2149*5daefc96SJeremy L Thompson         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
2150*5daefc96SJeremy L Thompson         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
2151*5daefc96SJeremy L Thompson         CeedCallBackend(CeedFree(&resource_root));
2152*5daefc96SJeremy L Thompson         CeedCallBackend(CeedDestroy(&basis_ceed));
2153*5daefc96SJeremy L Thompson       }
2154*5daefc96SJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis));
2155*5daefc96SJeremy L Thompson     }
2156*5daefc96SJeremy L Thompson   }
2157*5daefc96SJeremy L Thompson 
2158*5daefc96SJeremy L Thompson   // Retrieve operator data
2159*5daefc96SJeremy L Thompson   CeedCallBackend(CeedOperatorGetData(op, &data));
2160*5daefc96SJeremy L Thompson   Q       = data->Q;
2161*5daefc96SJeremy L Thompson   Q_1d    = data->Q_1d;
2162*5daefc96SJeremy L Thompson   max_dim = data->dim;
2163*5daefc96SJeremy L Thompson   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
2164*5daefc96SJeremy L Thompson   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
2165*5daefc96SJeremy L Thompson   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
2166*5daefc96SJeremy L Thompson 
2167*5daefc96SJeremy L Thompson   // Load basis source files
2168*5daefc96SJeremy L Thompson   if (!is_all_nontensor) {
2169*5daefc96SJeremy L Thompson     code << tab << "// Tensor basis source\n";
2170*5daefc96SJeremy L Thompson     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
2171*5daefc96SJeremy L Thompson   }
2172*5daefc96SJeremy L Thompson   if (!is_all_tensor) {
2173*5daefc96SJeremy L Thompson     code << tab << "// Non-tensor basis source\n";
2174*5daefc96SJeremy L Thompson     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n";
2175*5daefc96SJeremy L Thompson   }
2176*5daefc96SJeremy L Thompson   if (!is_all_tensor && !is_all_nontensor) {
2177*5daefc96SJeremy L Thompson     code << "// Tensor basis source\n";
2178*5daefc96SJeremy L Thompson     code << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h>\n\n";
2179*5daefc96SJeremy L Thompson   }
2180*5daefc96SJeremy L Thompson   code << "// CodeGen operator source\n";
2181*5daefc96SJeremy L Thompson   code << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
2182*5daefc96SJeremy L Thompson 
2183*5daefc96SJeremy L Thompson   // Get QFunction name
2184*5daefc96SJeremy L Thompson   std::string qfunction_name(qf_data->qfunction_name);
2185*5daefc96SJeremy L Thompson   std::string operator_name;
2186*5daefc96SJeremy L Thompson 
2187*5daefc96SJeremy L Thompson   operator_name = "CeedKernelHipGenQFunctionAssembly_" + qfunction_name;
2188*5daefc96SJeremy L Thompson 
2189*5daefc96SJeremy L Thompson   // Define CEED_Q_VLA
2190*5daefc96SJeremy L Thompson   code << "\n" << tab << "#undef CEED_Q_VLA\n";
2191*5daefc96SJeremy L Thompson   if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
2192*5daefc96SJeremy L Thompson     code << tab << "#define CEED_Q_VLA 1\n\n";
2193*5daefc96SJeremy L Thompson   } else {
2194*5daefc96SJeremy L Thompson     code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n";
2195*5daefc96SJeremy L Thompson   }
2196*5daefc96SJeremy L Thompson 
2197*5daefc96SJeremy L Thompson   // Add user QFunction source
2198*5daefc96SJeremy L Thompson   {
2199*5daefc96SJeremy L Thompson     const char *source_path;
2200*5daefc96SJeremy L Thompson 
2201*5daefc96SJeremy L Thompson     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
2202*5daefc96SJeremy L Thompson     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
2203*5daefc96SJeremy L Thompson 
2204*5daefc96SJeremy L Thompson     code << tab << "// User QFunction source\n";
2205*5daefc96SJeremy L Thompson     code << tab << "#include \"" << source_path << "\"\n\n";
2206*5daefc96SJeremy L Thompson   }
2207*5daefc96SJeremy L Thompson 
2208*5daefc96SJeremy L Thompson   // Setup
2209*5daefc96SJeremy L Thompson   code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
2210*5daefc96SJeremy L Thompson   code << tab << "// Operator Assembly Kernel\n";
2211*5daefc96SJeremy L Thompson   code << tab << "// \n";
2212*5daefc96SJeremy L Thompson   code << tab << "// d_[in,out]_i:   CeedVector device array\n";
2213*5daefc96SJeremy L Thompson   code << tab << "// r_[in,out]_e_i: Element vector register\n";
2214*5daefc96SJeremy L Thompson   code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
2215*5daefc96SJeremy L Thompson   code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
2216*5daefc96SJeremy L Thompson   code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
2217*5daefc96SJeremy L Thompson   code << tab << "// \n";
2218*5daefc96SJeremy L Thompson   code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
2219*5daefc96SJeremy L Thompson   code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
2220*5daefc96SJeremy L Thompson   code << tab << "// -----------------------------------------------------------------------------\n";
2221*5daefc96SJeremy L Thompson   code << tab << "extern \"C\" __global__ void " << operator_name
2222*5daefc96SJeremy L Thompson        << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar *W, Points_Hip "
2223*5daefc96SJeremy L Thompson           "points, CeedScalar *__restrict__ values_array) {\n";
2224*5daefc96SJeremy L Thompson   tab.push();
2225*5daefc96SJeremy L Thompson 
2226*5daefc96SJeremy L Thompson   // Scratch buffers
2227*5daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
2228*5daefc96SJeremy L Thompson     CeedEvalMode eval_mode;
2229*5daefc96SJeremy L Thompson 
2230*5daefc96SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
2231*5daefc96SJeremy L Thompson     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
2232*5daefc96SJeremy L Thompson       code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
2233*5daefc96SJeremy L Thompson     }
2234*5daefc96SJeremy L Thompson   }
2235*5daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
2236*5daefc96SJeremy L Thompson     code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
2237*5daefc96SJeremy L Thompson   }
2238*5daefc96SJeremy L Thompson 
2239*5daefc96SJeremy L Thompson   code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
2240*5daefc96SJeremy L Thompson   if (!is_all_tensor) {
2241*5daefc96SJeremy L Thompson     code << tab << "const CeedInt Q = " << Q << ";\n";
2242*5daefc96SJeremy L Thompson   }
2243*5daefc96SJeremy L Thompson   if (!is_all_nontensor) {
2244*5daefc96SJeremy L Thompson     code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
2245*5daefc96SJeremy L Thompson   }
2246*5daefc96SJeremy L Thompson 
2247*5daefc96SJeremy L Thompson   // Shared data
2248*5daefc96SJeremy L Thompson   code << tab << "extern __shared__ CeedScalar slice[];\n";
2249*5daefc96SJeremy L Thompson   code << tab << "SharedData_Hip data;\n";
2250*5daefc96SJeremy L Thompson   code << tab << "data.t_id_x = threadIdx.x;\n";
2251*5daefc96SJeremy L Thompson   code << tab << "data.t_id_y = threadIdx.y;\n";
2252*5daefc96SJeremy L Thompson   code << tab << "data.t_id_z = threadIdx.z;\n";
2253*5daefc96SJeremy L Thompson   code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
2254*5daefc96SJeremy L Thompson   code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
2255*5daefc96SJeremy L Thompson 
2256*5daefc96SJeremy L Thompson   // -- Determine input mat reuse
2257*5daefc96SJeremy L Thompson   FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
2258*5daefc96SJeremy L Thompson 
2259*5daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
2260*5daefc96SJeremy L Thompson     input_matrix_reuse[i].index = -1;
2261*5daefc96SJeremy L Thompson   }
2262*5daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
2263*5daefc96SJeremy L Thompson     bool         is_tensor = true;
2264*5daefc96SJeremy L Thompson     CeedEvalMode eval_mode_i;
2265*5daefc96SJeremy L Thompson     CeedBasis    basis_i;
2266*5daefc96SJeremy L Thompson 
2267*5daefc96SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
2268*5daefc96SJeremy L Thompson     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
2269*5daefc96SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
2270*5daefc96SJeremy L Thompson     CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
2271*5daefc96SJeremy L Thompson     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
2272*5daefc96SJeremy L Thompson       CeedEvalMode eval_mode_j;
2273*5daefc96SJeremy L Thompson       CeedBasis    basis_j;
2274*5daefc96SJeremy L Thompson 
2275*5daefc96SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
2276*5daefc96SJeremy L Thompson       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
2277*5daefc96SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
2278*5daefc96SJeremy L Thompson       if (basis_i == basis_j) {
2279*5daefc96SJeremy L Thompson         if (is_tensor) {
2280*5daefc96SJeremy L Thompson           input_matrix_reuse[i].index     = j;
2281*5daefc96SJeremy L Thompson           input_matrix_reuse[i].is_input  = true;
2282*5daefc96SJeremy L Thompson           input_matrix_reuse[i].eval_mode = eval_mode_j;
2283*5daefc96SJeremy L Thompson         } else {
2284*5daefc96SJeremy L Thompson           // For non-tensor can only re-use with the same eval mode
2285*5daefc96SJeremy L Thompson           if (eval_mode_i == eval_mode_j) {
2286*5daefc96SJeremy L Thompson             input_matrix_reuse[i].index     = j;
2287*5daefc96SJeremy L Thompson             input_matrix_reuse[i].is_input  = true;
2288*5daefc96SJeremy L Thompson             input_matrix_reuse[i].eval_mode = eval_mode_j;
2289*5daefc96SJeremy L Thompson           }
2290*5daefc96SJeremy L Thompson         }
2291*5daefc96SJeremy L Thompson       }
2292*5daefc96SJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis_j));
2293*5daefc96SJeremy L Thompson     }
2294*5daefc96SJeremy L Thompson     CeedCallBackend(CeedBasisDestroy(&basis_i));
2295*5daefc96SJeremy L Thompson   }
2296*5daefc96SJeremy L Thompson 
2297*5daefc96SJeremy L Thompson   // -- Determine output mat reuse
2298*5daefc96SJeremy L Thompson   FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
2299*5daefc96SJeremy L Thompson 
2300*5daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
2301*5daefc96SJeremy L Thompson     output_matrix_reuse[i].index = -1;
2302*5daefc96SJeremy L Thompson   }
2303*5daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
2304*5daefc96SJeremy L Thompson     bool         is_tensor = true;
2305*5daefc96SJeremy L Thompson     CeedEvalMode eval_mode_i;
2306*5daefc96SJeremy L Thompson     CeedBasis    basis_i;
2307*5daefc96SJeremy L Thompson 
2308*5daefc96SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
2309*5daefc96SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
2310*5daefc96SJeremy L Thompson     CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
2311*5daefc96SJeremy L Thompson     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
2312*5daefc96SJeremy L Thompson       CeedEvalMode eval_mode_j;
2313*5daefc96SJeremy L Thompson       CeedBasis    basis_j;
2314*5daefc96SJeremy L Thompson 
2315*5daefc96SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
2316*5daefc96SJeremy L Thompson       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
2317*5daefc96SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
2318*5daefc96SJeremy L Thompson       if (basis_i == basis_j) {
2319*5daefc96SJeremy L Thompson         if (is_tensor) {
2320*5daefc96SJeremy L Thompson           output_matrix_reuse[i].index     = j;
2321*5daefc96SJeremy L Thompson           output_matrix_reuse[i].is_input  = true;
2322*5daefc96SJeremy L Thompson           output_matrix_reuse[i].eval_mode = eval_mode_j;
2323*5daefc96SJeremy L Thompson         } else {
2324*5daefc96SJeremy L Thompson           // For non-tensor can only re-use with the same eval mode
2325*5daefc96SJeremy L Thompson           if (eval_mode_i == eval_mode_j) {
2326*5daefc96SJeremy L Thompson             output_matrix_reuse[i].index     = j;
2327*5daefc96SJeremy L Thompson             output_matrix_reuse[i].is_input  = true;
2328*5daefc96SJeremy L Thompson             output_matrix_reuse[i].eval_mode = eval_mode_j;
2329*5daefc96SJeremy L Thompson           }
2330*5daefc96SJeremy L Thompson         }
2331*5daefc96SJeremy L Thompson       }
2332*5daefc96SJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis_j));
2333*5daefc96SJeremy L Thompson     }
2334*5daefc96SJeremy L Thompson     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
2335*5daefc96SJeremy L Thompson       CeedEvalMode eval_mode_j;
2336*5daefc96SJeremy L Thompson       CeedBasis    basis_j;
2337*5daefc96SJeremy L Thompson 
2338*5daefc96SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
2339*5daefc96SJeremy L Thompson       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
2340*5daefc96SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
2341*5daefc96SJeremy L Thompson       if (basis_i == basis_j) {
2342*5daefc96SJeremy L Thompson         if (is_tensor) {
2343*5daefc96SJeremy L Thompson           output_matrix_reuse[i].index     = j;
2344*5daefc96SJeremy L Thompson           output_matrix_reuse[i].is_input  = false;
2345*5daefc96SJeremy L Thompson           output_matrix_reuse[i].eval_mode = eval_mode_j;
2346*5daefc96SJeremy L Thompson         } else {
2347*5daefc96SJeremy L Thompson           // For non-tensor can only re-use with the same eval mode
2348*5daefc96SJeremy L Thompson           if (eval_mode_i == eval_mode_j) {
2349*5daefc96SJeremy L Thompson             output_matrix_reuse[i].index     = j;
2350*5daefc96SJeremy L Thompson             output_matrix_reuse[i].is_input  = false;
2351*5daefc96SJeremy L Thompson             output_matrix_reuse[i].eval_mode = eval_mode_j;
2352*5daefc96SJeremy L Thompson           }
2353*5daefc96SJeremy L Thompson         }
2354*5daefc96SJeremy L Thompson       }
2355*5daefc96SJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis_j));
2356*5daefc96SJeremy L Thompson     }
2357*5daefc96SJeremy L Thompson     CeedCallBackend(CeedBasisDestroy(&basis_i));
2358*5daefc96SJeremy L Thompson   }
2359*5daefc96SJeremy L Thompson 
2360*5daefc96SJeremy L Thompson   // Initialize constants, and matrices B and G
2361*5daefc96SJeremy L Thompson   code << "\n" << tab << "// Input field constants and basis data\n";
2362*5daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
2363*5daefc96SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
2364*5daefc96SJeremy L Thompson                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
2365*5daefc96SJeremy L Thompson   }
2366*5daefc96SJeremy L Thompson   code << "\n" << tab << "// Output field constants and basis data\n";
2367*5daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
2368*5daefc96SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
2369*5daefc96SJeremy L Thompson                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
2370*5daefc96SJeremy L Thompson   }
2371*5daefc96SJeremy L Thompson 
2372*5daefc96SJeremy L Thompson   // Loop over all elements
2373*5daefc96SJeremy L Thompson   code << "\n" << tab << "// Element loop\n";
2374*5daefc96SJeremy L Thompson   code << tab << "__syncthreads();\n";
2375*5daefc96SJeremy L Thompson   code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
2376*5daefc96SJeremy L Thompson   tab.push();
2377*5daefc96SJeremy L Thompson 
2378*5daefc96SJeremy L Thompson   // -- Compute minimum buffer space needed
2379*5daefc96SJeremy L Thompson   CeedInt max_rstr_buffer_size = 1;
2380*5daefc96SJeremy L Thompson 
2381*5daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
2382*5daefc96SJeremy L Thompson     CeedEvalMode eval_mode;
2383*5daefc96SJeremy L Thompson 
2384*5daefc96SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
2385*5daefc96SJeremy L Thompson     if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
2386*5daefc96SJeremy L Thompson       CeedInt             num_comp;
2387*5daefc96SJeremy L Thompson       CeedElemRestriction elem_rstr;
2388*5daefc96SJeremy L Thompson 
2389*5daefc96SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
2390*5daefc96SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
2391*5daefc96SJeremy L Thompson       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
2392*5daefc96SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
2393*5daefc96SJeremy L Thompson     }
2394*5daefc96SJeremy L Thompson   }
2395*5daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
2396*5daefc96SJeremy L Thompson     CeedEvalMode eval_mode;
2397*5daefc96SJeremy L Thompson 
2398*5daefc96SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
2399*5daefc96SJeremy L Thompson     if (eval_mode != CEED_EVAL_NONE) {
2400*5daefc96SJeremy L Thompson       CeedInt             num_comp;
2401*5daefc96SJeremy L Thompson       CeedElemRestriction elem_rstr;
2402*5daefc96SJeremy L Thompson 
2403*5daefc96SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
2404*5daefc96SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
2405*5daefc96SJeremy L Thompson       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
2406*5daefc96SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
2407*5daefc96SJeremy L Thompson     }
2408*5daefc96SJeremy L Thompson   }
2409*5daefc96SJeremy L Thompson   code << tab << "// Scratch restriction buffer space\n";
2410*5daefc96SJeremy L Thompson   code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
2411*5daefc96SJeremy L Thompson 
2412*5daefc96SJeremy L Thompson   // -- Determine best input field processing order
2413*5daefc96SJeremy L Thompson   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
2414*5daefc96SJeremy L Thompson 
2415*5daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
2416*5daefc96SJeremy L Thompson     field_rstr_in_buffer[i] = -1;
2417*5daefc96SJeremy L Thompson     input_field_order[i]    = -1;
2418*5daefc96SJeremy L Thompson   }
2419*5daefc96SJeremy L Thompson   {
2420*5daefc96SJeremy L Thompson     bool    is_ordered[CEED_FIELD_MAX];
2421*5daefc96SJeremy L Thompson     CeedInt curr_index = 0;
2422*5daefc96SJeremy L Thompson 
2423*5daefc96SJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
2424*5daefc96SJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) {
2425*5daefc96SJeremy L Thompson       CeedVector          vec_i;
2426*5daefc96SJeremy L Thompson       CeedElemRestriction rstr_i;
2427*5daefc96SJeremy L Thompson 
2428*5daefc96SJeremy L Thompson       if (is_ordered[i]) continue;
2429*5daefc96SJeremy L Thompson       field_rstr_in_buffer[i]       = i;
2430*5daefc96SJeremy L Thompson       is_ordered[i]                 = true;
2431*5daefc96SJeremy L Thompson       input_field_order[curr_index] = i;
2432*5daefc96SJeremy L Thompson       curr_index++;
2433*5daefc96SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
2434*5daefc96SJeremy L Thompson       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
2435*5daefc96SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
2436*5daefc96SJeremy L Thompson       for (CeedInt j = i + 1; j < num_input_fields; j++) {
2437*5daefc96SJeremy L Thompson         CeedVector          vec_j;
2438*5daefc96SJeremy L Thompson         CeedElemRestriction rstr_j;
2439*5daefc96SJeremy L Thompson 
2440*5daefc96SJeremy L Thompson         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
2441*5daefc96SJeremy L Thompson         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
2442*5daefc96SJeremy L Thompson         if (rstr_i == rstr_j && vec_i == vec_j) {
2443*5daefc96SJeremy L Thompson           field_rstr_in_buffer[j]       = i;
2444*5daefc96SJeremy L Thompson           is_ordered[j]                 = true;
2445*5daefc96SJeremy L Thompson           input_field_order[curr_index] = j;
2446*5daefc96SJeremy L Thompson           curr_index++;
2447*5daefc96SJeremy L Thompson         }
2448*5daefc96SJeremy L Thompson         CeedCallBackend(CeedVectorDestroy(&vec_j));
2449*5daefc96SJeremy L Thompson         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
2450*5daefc96SJeremy L Thompson       }
2451*5daefc96SJeremy L Thompson       CeedCallBackend(CeedVectorDestroy(&vec_i));
2452*5daefc96SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
2453*5daefc96SJeremy L Thompson     }
2454*5daefc96SJeremy L Thompson   }
2455*5daefc96SJeremy L Thompson 
2456*5daefc96SJeremy L Thompson   // -- Input restriction and basis
2457*5daefc96SJeremy L Thompson   code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
2458*5daefc96SJeremy L Thompson   CeedInt num_active_in = 0, num_active_out = 0, qf_assembly_size_out = 0;
2459*5daefc96SJeremy L Thompson   CeedInt active_fields_in[CEED_FIELD_MAX], active_fields_out[CEED_FIELD_MAX];
2460*5daefc96SJeremy L Thompson 
2461*5daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
2462*5daefc96SJeremy L Thompson     bool          is_active = false;
2463*5daefc96SJeremy L Thompson     const char   *field_name;
2464*5daefc96SJeremy L Thompson     const CeedInt f = input_field_order[i];
2465*5daefc96SJeremy L Thompson 
2466*5daefc96SJeremy L Thompson     {
2467*5daefc96SJeremy L Thompson       CeedVector vec;
2468*5daefc96SJeremy L Thompson 
2469*5daefc96SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
2470*5daefc96SJeremy L Thompson       is_active = vec == CEED_VECTOR_ACTIVE;
2471*5daefc96SJeremy L Thompson       CeedCallBackend(CeedVectorDestroy(&vec));
2472*5daefc96SJeremy L Thompson     }
2473*5daefc96SJeremy L Thompson 
2474*5daefc96SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
2475*5daefc96SJeremy L Thompson     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
2476*5daefc96SJeremy L Thompson 
2477*5daefc96SJeremy L Thompson     if (is_active) {
2478*5daefc96SJeremy L Thompson       CeedEvalMode eval_mode;
2479*5daefc96SJeremy L Thompson       CeedInt      field_size;
2480*5daefc96SJeremy L Thompson 
2481*5daefc96SJeremy L Thompson       active_fields_in[num_active_in] = f;
2482*5daefc96SJeremy L Thompson       num_active_in++;
2483*5daefc96SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[f], &field_size));
2484*5daefc96SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[f], &eval_mode));
2485*5daefc96SJeremy L Thompson       if (eval_mode == CEED_EVAL_GRAD) {
2486*5daefc96SJeremy L Thompson         code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << "dim_in_" << f << "*"
2487*5daefc96SJeremy L Thompson              << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
2488*5daefc96SJeremy L Thompson       } else {
2489*5daefc96SJeremy L Thompson         code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
2490*5daefc96SJeremy L Thompson       }
2491*5daefc96SJeremy L Thompson       code << tab << "const CeedInt field_size_in_" << f << " = " << field_size << ";\n";
2492*5daefc96SJeremy L Thompson     } else {
2493*5daefc96SJeremy L Thompson       // ---- Restriction
2494*5daefc96SJeremy L Thompson       CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
2495*5daefc96SJeremy L Thompson                                                                  max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
2496*5daefc96SJeremy L Thompson 
2497*5daefc96SJeremy L Thompson       // ---- Basis action
2498*5daefc96SJeremy L Thompson       CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
2499*5daefc96SJeremy L Thompson                                                            is_all_tensor, is_at_points, use_3d_slices));
2500*5daefc96SJeremy L Thompson     }
2501*5daefc96SJeremy L Thompson   }
2502*5daefc96SJeremy L Thompson   code << tab << "const CeedInt field_sizes_in[" << num_active_in << "] = {";
2503*5daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_active_in; i++) {
2504*5daefc96SJeremy L Thompson     code << "field_size_in_" << active_fields_in[i] << (i < num_active_in - 1 ? ", " : "");
2505*5daefc96SJeremy L Thompson   }
2506*5daefc96SJeremy L Thompson   code << "};\n";
2507*5daefc96SJeremy L Thompson   code << tab << "CeedScalar * r_q_in[" << num_active_in << "] = {";
2508*5daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_active_in; i++) {
2509*5daefc96SJeremy L Thompson     code << "r_q_in_" << active_fields_in[i] << (i < num_active_in - 1 ? ", " : "");
2510*5daefc96SJeremy L Thompson   }
2511*5daefc96SJeremy L Thompson   code << "};\n";
2512*5daefc96SJeremy L Thompson 
2513*5daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
2514*5daefc96SJeremy L Thompson     bool is_active = false;
2515*5daefc96SJeremy L Thompson 
2516*5daefc96SJeremy L Thompson     {
2517*5daefc96SJeremy L Thompson       CeedVector vec;
2518*5daefc96SJeremy L Thompson 
2519*5daefc96SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
2520*5daefc96SJeremy L Thompson       is_active = vec == CEED_VECTOR_ACTIVE;
2521*5daefc96SJeremy L Thompson       CeedCallBackend(CeedVectorDestroy(&vec));
2522*5daefc96SJeremy L Thompson     }
2523*5daefc96SJeremy L Thompson     if (is_active) {
2524*5daefc96SJeremy L Thompson       const char *field_name;
2525*5daefc96SJeremy L Thompson       CeedInt     field_size;
2526*5daefc96SJeremy L Thompson 
2527*5daefc96SJeremy L Thompson       active_fields_out[num_active_out] = i;
2528*5daefc96SJeremy L Thompson       num_active_out++;
2529*5daefc96SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
2530*5daefc96SJeremy L Thompson       qf_assembly_size_out += field_size;
2531*5daefc96SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
2532*5daefc96SJeremy L Thompson       code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
2533*5daefc96SJeremy L Thompson       code << tab << "const CeedInt field_size_out_" << i << " = " << field_size << ";\n";
2534*5daefc96SJeremy L Thompson     }
2535*5daefc96SJeremy L Thompson   }
2536*5daefc96SJeremy L Thompson   code << tab << "const CeedInt field_sizes_out[" << num_active_out << "] = {";
2537*5daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_active_out; i++) {
2538*5daefc96SJeremy L Thompson     code << "field_size_out_" << active_fields_out[i] << (i < num_active_out - 1 ? ", " : "");
2539*5daefc96SJeremy L Thompson   }
2540*5daefc96SJeremy L Thompson   code << "};\n";
2541*5daefc96SJeremy L Thompson   code << tab << "const CeedInt total_size_out = " << qf_assembly_size_out << ";\n";
2542*5daefc96SJeremy L Thompson 
2543*5daefc96SJeremy L Thompson   // -- Loop over active field
2544*5daefc96SJeremy L Thompson   code << "\n" << tab << "CeedInt input_offset = 0;\n";
2545*5daefc96SJeremy L Thompson   code << tab << "// Loop over active QFunction input fields\n";
2546*5daefc96SJeremy L Thompson   code << tab << "const CeedInt num_active_in = " << num_active_in << ";\n";
2547*5daefc96SJeremy L Thompson   code << tab << "for (CeedInt a = 0; a < num_active_in; a++) {\n";
2548*5daefc96SJeremy L Thompson   tab.push();
2549*5daefc96SJeremy L Thompson 
2550*5daefc96SJeremy L Thompson   // -- Loop over size of active field
2551*5daefc96SJeremy L Thompson   code << "\n" << tab << "// Loop over current active input field size\n";
2552*5daefc96SJeremy L Thompson   code << tab << "const CeedInt field_size_in = field_sizes_in[a];\n";
2553*5daefc96SJeremy L Thompson   code << tab << "for (CeedInt s = 0; s < field_size_in; s++) {\n";
2554*5daefc96SJeremy L Thompson   tab.push();
2555*5daefc96SJeremy L Thompson 
2556*5daefc96SJeremy L Thompson   // -- Set current active point and component to 1
2557*5daefc96SJeremy L Thompson   code << tab << "// Set current active point and component to 1.0\n";
2558*5daefc96SJeremy L Thompson   if (is_all_tensor && (max_dim >= 3)) {
2559*5daefc96SJeremy L Thompson     code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 1.0;\n";
2560*5daefc96SJeremy L Thompson   } else {
2561*5daefc96SJeremy L Thompson     code << tab << "r_q_in[a][s] = 1.0;\n";
2562*5daefc96SJeremy L Thompson   }
2563*5daefc96SJeremy L Thompson 
2564*5daefc96SJeremy L Thompson   // -- Q function
2565*5daefc96SJeremy L Thompson   CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
2566*5daefc96SJeremy L Thompson                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
2567*5daefc96SJeremy L Thompson                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices));
2568*5daefc96SJeremy L Thompson 
2569*5daefc96SJeremy L Thompson   // -- Output basis and restriction
2570*5daefc96SJeremy L Thompson   code << "\n" << tab << "// -- Output field basis action and restrictions\n";
2571*5daefc96SJeremy L Thompson   CeedScalar offset = 0;
2572*5daefc96SJeremy L Thompson 
2573*5daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
2574*5daefc96SJeremy L Thompson     bool        is_active = false;
2575*5daefc96SJeremy L Thompson     const char *field_name;
2576*5daefc96SJeremy L Thompson 
2577*5daefc96SJeremy L Thompson     {
2578*5daefc96SJeremy L Thompson       CeedVector vec;
2579*5daefc96SJeremy L Thompson 
2580*5daefc96SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
2581*5daefc96SJeremy L Thompson       is_active = vec == CEED_VECTOR_ACTIVE;
2582*5daefc96SJeremy L Thompson       CeedCallBackend(CeedVectorDestroy(&vec));
2583*5daefc96SJeremy L Thompson     }
2584*5daefc96SJeremy L Thompson     if (!is_active) continue;
2585*5daefc96SJeremy L Thompson 
2586*5daefc96SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
2587*5daefc96SJeremy L Thompson     code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
2588*5daefc96SJeremy L Thompson 
2589*5daefc96SJeremy L Thompson     // ---- Restriction
2590*5daefc96SJeremy L Thompson     CeedInt field_size;
2591*5daefc96SJeremy L Thompson 
2592*5daefc96SJeremy L Thompson     code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d_QFAssembly<total_size_out, field_size_out_" << i << ", "
2593*5daefc96SJeremy L Thompson          << (is_all_tensor ? "Q_1d" : "Q") << ">(data, num_elem, elem, input_offset + s, " << offset << ", r_q_out_" << i << ", values_array);\n";
2594*5daefc96SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
2595*5daefc96SJeremy L Thompson     offset += field_size;
2596*5daefc96SJeremy L Thompson   }
2597*5daefc96SJeremy L Thompson 
2598*5daefc96SJeremy L Thompson   // -- Reset current active node and component
2599*5daefc96SJeremy L Thompson   code << "\n" << tab << "// Reset current active node and component to 0.0\n";
2600*5daefc96SJeremy L Thompson   if (is_all_tensor && (max_dim >= 3)) {
2601*5daefc96SJeremy L Thompson     code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 0.0;\n";
2602*5daefc96SJeremy L Thompson   } else {
2603*5daefc96SJeremy L Thompson     code << tab << "r_q_in[a][s] = 0.0;\n";
2604*5daefc96SJeremy L Thompson   }
2605*5daefc96SJeremy L Thompson 
2606*5daefc96SJeremy L Thompson   // -- End of loop over size of active field
2607*5daefc96SJeremy L Thompson   tab.pop();
2608*5daefc96SJeremy L Thompson   code << tab << "}\n";
2609*5daefc96SJeremy L Thompson   code << tab << "input_offset += field_size_in;\n";
2610*5daefc96SJeremy L Thompson 
2611*5daefc96SJeremy L Thompson   // -- End of loop over active field
2612*5daefc96SJeremy L Thompson   tab.pop();
2613*5daefc96SJeremy L Thompson   code << tab << "}\n";
2614*5daefc96SJeremy L Thompson 
2615*5daefc96SJeremy L Thompson   // Close loop and function
2616*5daefc96SJeremy L Thompson   tab.pop();
2617*5daefc96SJeremy L Thompson   code << tab << "}\n";
2618*5daefc96SJeremy L Thompson   tab.pop();
2619*5daefc96SJeremy L Thompson   code << tab << "}\n";
2620*5daefc96SJeremy L Thompson   code << tab << "// -----------------------------------------------------------------------------\n\n";
2621*5daefc96SJeremy L Thompson 
2622*5daefc96SJeremy L Thompson   CeedInt block_sizes[3] = {0, 0, 0};
2623*5daefc96SJeremy L Thompson   CeedInt num_elem;
2624*5daefc96SJeremy L Thompson 
2625*5daefc96SJeremy L Thompson   // Compile
2626*5daefc96SJeremy L Thompson   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
2627*5daefc96SJeremy L Thompson   CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes));
2628*5daefc96SJeremy L Thompson   block_sizes[2] = 1;
2629*5daefc96SJeremy L Thompson   {
2630*5daefc96SJeremy L Thompson     bool is_compile_good = false;
2631*5daefc96SJeremy L Thompson 
2632*5daefc96SJeremy L Thompson     data->thread_1d = block_sizes[0];
2633*5daefc96SJeremy L Thompson     CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 2, "OP_T_1D", block_sizes[0],
2634*5daefc96SJeremy L Thompson                                        "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]));
2635*5daefc96SJeremy L Thompson     if (is_compile_good) {
2636*5daefc96SJeremy L Thompson       *is_good_build = true;
2637*5daefc96SJeremy L Thompson       CeedCallBackend(CeedGetKernel_Hip(ceed, data->module_assemble_qfunction, operator_name.c_str(), &data->assemble_qfunction));
2638*5daefc96SJeremy L Thompson     } else {
2639*5daefc96SJeremy L Thompson       *is_good_build              = false;
2640*5daefc96SJeremy L Thompson       data->use_assembly_fallback = true;
2641*5daefc96SJeremy L Thompson     }
2642*5daefc96SJeremy L Thompson   }
2643*5daefc96SJeremy L Thompson   CeedCallBackend(CeedDestroy(&ceed));
2644*5daefc96SJeremy L Thompson   CeedCallBackend(CeedQFunctionDestroy(&qf));
2645*5daefc96SJeremy L Thompson   return CEED_ERROR_SUCCESS;
2646*5daefc96SJeremy L Thompson }
2647692716b7SZach Atkins 
26480183ed61SJeremy L Thompson //------------------------------------------------------------------------------
2649