xref: /libCEED/rust/libceed-sys/c-src/backends/hip-gen/ceed-hip-gen-operator-build.cpp (revision a49e5d53e180225109bfad71df325c7cfa170c69)
19ba83ac0SJeremy L Thompson // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
37d8d0e25Snbeams //
43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
57d8d0e25Snbeams //
63d8e8822SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
73d576824SJeremy L Thompson 
87d8d0e25Snbeams #define CEED_DEBUG_COLOR 12
97d8d0e25Snbeams 
1049aac155SJeremy L Thompson #include <ceed.h>
11ec3da8bcSJed Brown #include <ceed/backend.h>
120183ed61SJeremy L Thompson #include <ceed/gen-tools.h>
139e201c85SYohann #include <ceed/jit-tools.h>
142b730f8bSJeremy L Thompson 
157d8d0e25Snbeams #include <iostream>
167d8d0e25Snbeams #include <sstream>
172b730f8bSJeremy L Thompson #include <string>
182b730f8bSJeremy L Thompson 
190d0321e0SJeremy L Thompson #include "../hip-ref/ceed-hip-ref.h"
207d8d0e25Snbeams #include "../hip-shared/ceed-hip-shared.h"
21b2165e7aSSebastian Grimberg #include "../hip/ceed-hip-common.h"
227d8d0e25Snbeams #include "../hip/ceed-hip-compile.h"
232b730f8bSJeremy L Thompson #include "ceed-hip-gen.h"
24b3e1519bSnbeams 
2545a787f7SJeremy L Thompson struct FieldReuse_Hip {
2645a787f7SJeremy L Thompson   CeedInt      index;
2745a787f7SJeremy L Thompson   bool         is_input;
2845a787f7SJeremy L Thompson   CeedEvalMode eval_mode;
2945a787f7SJeremy L Thompson };
3045a787f7SJeremy L Thompson 
31b3e1519bSnbeams //------------------------------------------------------------------------------
32b3e1519bSnbeams // Calculate the block size used for launching the operator kernel
33b3e1519bSnbeams //------------------------------------------------------------------------------
BlockGridCalculate_Hip_gen(const CeedInt dim,const CeedInt num_elem,const CeedInt P_1d,const CeedInt Q_1d,CeedInt * block_sizes)342b730f8bSJeremy L Thompson extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt num_elem, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes) {
353a2968d6SJeremy L Thompson   const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
36b3e1519bSnbeams   if (dim == 1) {
373a2968d6SJeremy L Thompson     CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
38b7453713SJeremy L Thompson 
399e201c85SYohann     elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
403a2968d6SJeremy L Thompson     block_sizes[0]  = thread_1d;
41b3e1519bSnbeams     block_sizes[1]  = 1;
429e201c85SYohann     block_sizes[2]  = elems_per_block;
43b3e1519bSnbeams   } else if (dim == 2) {
443a2968d6SJeremy L Thompson     const CeedInt elems_per_block = thread_1d < 4 ? 16 : 2;
45b7453713SJeremy L Thompson 
463a2968d6SJeremy L Thompson     block_sizes[0] = thread_1d;
473a2968d6SJeremy L Thompson     block_sizes[1] = thread_1d;
489e201c85SYohann     block_sizes[2] = elems_per_block;
49b3e1519bSnbeams   } else if (dim == 3) {
503a2968d6SJeremy L Thompson     const CeedInt elems_per_block = thread_1d < 6 ? 4 : (thread_1d < 8 ? 2 : 1);
51b7453713SJeremy L Thompson 
523a2968d6SJeremy L Thompson     block_sizes[0] = thread_1d;
533a2968d6SJeremy L Thompson     block_sizes[1] = thread_1d;
549e201c85SYohann     block_sizes[2] = elems_per_block;
55b3e1519bSnbeams   }
56b3e1519bSnbeams   return CEED_ERROR_SUCCESS;
57b3e1519bSnbeams }
58b3e1519bSnbeams 
597d8d0e25Snbeams //------------------------------------------------------------------------------
604b3e95d5SJeremy L Thompson // Determine type of operator
614b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
CeedOperatorBuildKernelData_Hip_gen(Ceed ceed,CeedInt num_input_fields,CeedOperatorField * op_input_fields,CeedQFunctionField * qf_input_fields,CeedInt num_output_fields,CeedOperatorField * op_output_fields,CeedQFunctionField * qf_output_fields,CeedInt * max_P,CeedInt * max_P_1d,CeedInt * Q,CeedInt * Q_1d,CeedInt * max_dim,bool * is_all_tensor,bool * use_3d_slices)624b3e95d5SJeremy L Thompson static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
634b3e95d5SJeremy L Thompson                                                CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields,
6474398b5aSJeremy L Thompson                                                CeedQFunctionField *qf_output_fields, CeedInt *max_P, CeedInt *max_P_1d, CeedInt *Q, CeedInt *Q_1d,
6574398b5aSJeremy L Thompson                                                CeedInt *max_dim, bool *is_all_tensor, bool *use_3d_slices) {
6674398b5aSJeremy L Thompson   // Check if all are tensor
6774398b5aSJeremy L Thompson   *is_all_tensor = true;
684b3e95d5SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
694b3e95d5SJeremy L Thompson     CeedBasis basis;
704b3e95d5SJeremy L Thompson 
714b3e95d5SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
724b3e95d5SJeremy L Thompson     if (basis != CEED_BASIS_NONE) {
734b3e95d5SJeremy L Thompson       bool is_field_tensor;
744b3e95d5SJeremy L Thompson 
754b3e95d5SJeremy L Thompson       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
7674398b5aSJeremy L Thompson       *is_all_tensor = *is_all_tensor && is_field_tensor;
774b3e95d5SJeremy L Thompson     }
783a2968d6SJeremy L Thompson     CeedCallBackend(CeedBasisDestroy(&basis));
794b3e95d5SJeremy L Thompson   }
804b3e95d5SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
814b3e95d5SJeremy L Thompson     CeedBasis basis;
824b3e95d5SJeremy L Thompson 
834b3e95d5SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
844b3e95d5SJeremy L Thompson     if (basis != CEED_BASIS_NONE) {
854b3e95d5SJeremy L Thompson       bool is_field_tensor;
864b3e95d5SJeremy L Thompson 
874b3e95d5SJeremy L Thompson       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
8874398b5aSJeremy L Thompson       *is_all_tensor = *is_all_tensor && is_field_tensor;
8974398b5aSJeremy L Thompson     }
9074398b5aSJeremy L Thompson     CeedCallBackend(CeedBasisDestroy(&basis));
9174398b5aSJeremy L Thompson   }
9274398b5aSJeremy L Thompson 
9374398b5aSJeremy L Thompson   // Find max_P, max_P_1d, Q, and Q_1d
9474398b5aSJeremy L Thompson   bool is_all_3d = true;
9574398b5aSJeremy L Thompson 
9674398b5aSJeremy L Thompson   *max_P    = 0;
9774398b5aSJeremy L Thompson   *max_P_1d = 0;
9874398b5aSJeremy L Thompson   *Q        = 0;
9974398b5aSJeremy L Thompson   *Q_1d     = 0;
10074398b5aSJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
10174398b5aSJeremy L Thompson     CeedBasis basis;
10274398b5aSJeremy L Thompson 
10374398b5aSJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
10474398b5aSJeremy L Thompson     if (basis != CEED_BASIS_NONE) {
10574398b5aSJeremy L Thompson       bool    is_field_tensor;
10674398b5aSJeremy L Thompson       CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
10774398b5aSJeremy L Thompson 
10874398b5aSJeremy L Thompson       // Check if 3D
1094b3e95d5SJeremy L Thompson       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
11074398b5aSJeremy L Thompson       is_all_3d = is_all_3d && (field_dim == 3);
11174398b5aSJeremy L Thompson       *max_dim  = CeedIntMax(*max_dim, field_dim);
11274398b5aSJeremy L Thompson 
11374398b5aSJeremy L Thompson       // Collect P, P_1d, Q, and Q_1d
11474398b5aSJeremy L Thompson       CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
11574398b5aSJeremy L Thompson       *max_P = CeedIntMax(*max_P, field_P);
11674398b5aSJeremy L Thompson       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
11774398b5aSJeremy L Thompson       if (is_field_tensor) {
11874398b5aSJeremy L Thompson         CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
11974398b5aSJeremy L Thompson         *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
12074398b5aSJeremy L Thompson       }
12174398b5aSJeremy L Thompson       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
12274398b5aSJeremy L Thompson       CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
12374398b5aSJeremy L Thompson       *Q = field_Q;
12474398b5aSJeremy L Thompson       if (is_field_tensor) {
12574398b5aSJeremy L Thompson         CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
1264b3e95d5SJeremy L Thompson         CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
1274b3e95d5SJeremy L Thompson         *Q_1d = field_Q_1d;
1284b3e95d5SJeremy L Thompson       }
12974398b5aSJeremy L Thompson     }
13074398b5aSJeremy L Thompson     CeedCallBackend(CeedBasisDestroy(&basis));
13174398b5aSJeremy L Thompson   }
13274398b5aSJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
13374398b5aSJeremy L Thompson     CeedBasis basis;
13474398b5aSJeremy L Thompson 
13574398b5aSJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
13674398b5aSJeremy L Thompson     if (basis != CEED_BASIS_NONE) {
13774398b5aSJeremy L Thompson       bool    is_field_tensor;
13874398b5aSJeremy L Thompson       CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
13974398b5aSJeremy L Thompson 
14074398b5aSJeremy L Thompson       // Check if 3D
14174398b5aSJeremy L Thompson       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
14274398b5aSJeremy L Thompson       is_all_3d = is_all_3d && (field_dim == 3);
14374398b5aSJeremy L Thompson       *max_dim  = CeedIntMax(*max_dim, field_dim);
14474398b5aSJeremy L Thompson 
14574398b5aSJeremy L Thompson       // Collect P, P_1d, Q, and Q_1d
14674398b5aSJeremy L Thompson       CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
14774398b5aSJeremy L Thompson       *max_P = CeedIntMax(*max_P, field_P);
14874398b5aSJeremy L Thompson       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
14974398b5aSJeremy L Thompson       if (is_field_tensor) {
15074398b5aSJeremy L Thompson         CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
15174398b5aSJeremy L Thompson         *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
15274398b5aSJeremy L Thompson       }
15374398b5aSJeremy L Thompson       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
15474398b5aSJeremy L Thompson       CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
15574398b5aSJeremy L Thompson       *Q = field_Q;
15674398b5aSJeremy L Thompson       if (is_field_tensor) {
15774398b5aSJeremy L Thompson         CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
15874398b5aSJeremy L Thompson         CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
15974398b5aSJeremy L Thompson         *Q_1d = field_Q_1d;
16074398b5aSJeremy L Thompson       }
16174398b5aSJeremy L Thompson     }
1623a2968d6SJeremy L Thompson     CeedCallBackend(CeedBasisDestroy(&basis));
1634b3e95d5SJeremy L Thompson   }
1644b3e95d5SJeremy L Thompson 
1654b3e95d5SJeremy L Thompson   // Only use 3D collocated gradient parallelization strategy when gradient is computed
1664b3e95d5SJeremy L Thompson   *use_3d_slices = false;
16774398b5aSJeremy L Thompson   if (is_all_3d && *is_all_tensor) {
1684b3e95d5SJeremy L Thompson     bool was_grad_found = false;
1694b3e95d5SJeremy L Thompson 
1704b3e95d5SJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) {
1714b3e95d5SJeremy L Thompson       CeedEvalMode eval_mode;
1724b3e95d5SJeremy L Thompson 
1734b3e95d5SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1744b3e95d5SJeremy L Thompson       if (eval_mode == CEED_EVAL_GRAD) {
1754b3e95d5SJeremy L Thompson         CeedBasis_Hip_shared *basis_data;
1764b3e95d5SJeremy L Thompson         CeedBasis             basis;
1774b3e95d5SJeremy L Thompson 
1784b3e95d5SJeremy L Thompson         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
1794b3e95d5SJeremy L Thompson         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
1804b3e95d5SJeremy L Thompson         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
1814b3e95d5SJeremy L Thompson         was_grad_found = true;
1823a2968d6SJeremy L Thompson         CeedCallBackend(CeedBasisDestroy(&basis));
1834b3e95d5SJeremy L Thompson       }
1844b3e95d5SJeremy L Thompson     }
1854b3e95d5SJeremy L Thompson     for (CeedInt i = 0; i < num_output_fields; i++) {
1864b3e95d5SJeremy L Thompson       CeedEvalMode eval_mode;
1874b3e95d5SJeremy L Thompson 
1884b3e95d5SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
1894b3e95d5SJeremy L Thompson       if (eval_mode == CEED_EVAL_GRAD) {
1904b3e95d5SJeremy L Thompson         CeedBasis_Hip_shared *basis_data;
1914b3e95d5SJeremy L Thompson         CeedBasis             basis;
1924b3e95d5SJeremy L Thompson 
1934b3e95d5SJeremy L Thompson         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
1944b3e95d5SJeremy L Thompson         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
1954b3e95d5SJeremy L Thompson         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
1964b3e95d5SJeremy L Thompson         was_grad_found = true;
1973a2968d6SJeremy L Thompson         CeedCallBackend(CeedBasisDestroy(&basis));
1984b3e95d5SJeremy L Thompson       }
1994b3e95d5SJeremy L Thompson     }
2004b3e95d5SJeremy L Thompson   }
2014b3e95d5SJeremy L Thompson   return CEED_ERROR_SUCCESS;
2024b3e95d5SJeremy L Thompson }
2034b3e95d5SJeremy L Thompson 
2044b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
2054b3e95d5SJeremy L Thompson // Setup fields
2064b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream & code,CeedOperator_Hip_gen * data,Tab & tab,CeedInt i,CeedOperatorField op_field,CeedQFunctionField qf_field,FieldReuse_Hip field_reuse,CeedInt max_dim,CeedInt Q,CeedInt Q_1d,bool is_input,bool is_all_tensor,bool is_at_points,bool use_3d_slices,bool skip_active_load)2070183ed61SJeremy L Thompson static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i,
2080183ed61SJeremy L Thompson                                                     CeedOperatorField op_field, CeedQFunctionField qf_field, FieldReuse_Hip field_reuse,
2090183ed61SJeremy L Thompson                                                     CeedInt max_dim, CeedInt Q, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
210ca1da9b9SJeremy L Thompson                                                     bool use_3d_slices, bool skip_active_load) {
211ca1da9b9SJeremy L Thompson   bool      is_tensor = true, is_active = true;
21274398b5aSJeremy L Thompson   CeedBasis basis;
213ca1da9b9SJeremy L Thompson 
21474398b5aSJeremy L Thompson   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
21574398b5aSJeremy L Thompson   if (basis != CEED_BASIS_NONE) CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
216ca1da9b9SJeremy L Thompson   {
217ca1da9b9SJeremy L Thompson     CeedVector vec;
218ca1da9b9SJeremy L Thompson 
219ca1da9b9SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetVector(op_field, &vec));
220ca1da9b9SJeremy L Thompson     is_active = vec == CEED_VECTOR_ACTIVE;
221ca1da9b9SJeremy L Thompson     CeedCallBackend(CeedVectorDestroy(&vec));
222ca1da9b9SJeremy L Thompson   }
22374398b5aSJeremy L Thompson 
22459fa3f92SJeremy L Thompson   const char           *field_name;
2254b3e95d5SJeremy L Thompson   std::string           var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
2269123fb08SJeremy L Thompson   std::string           P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
2274b3e95d5SJeremy L Thompson   std::string           option_name = (is_input ? "inputs" : "outputs");
2284b3e95d5SJeremy L Thompson   CeedEvalMode          eval_mode   = CEED_EVAL_NONE;
22974398b5aSJeremy L Thompson   CeedInt               elem_size = 0, num_comp = 0, dim = max_dim, P_1d = 0;
2304b3e95d5SJeremy L Thompson   CeedElemRestriction   elem_rstr;
2314b3e95d5SJeremy L Thompson   CeedBasis_Hip_shared *basis_data;
2324b3e95d5SJeremy L Thompson 
2339ee499e5SJeremy L Thompson   // Field reuse info
23445a787f7SJeremy L Thompson   bool use_previous_field = field_reuse.index != -1;
2359ee499e5SJeremy L Thompson 
23659fa3f92SJeremy L Thompson   CeedCallBackend(CeedOperatorFieldGetName(op_field, &field_name));
2370183ed61SJeremy L Thompson   code << tab << "// -- " << (is_input ? "Input" : "Output") << " field " << i << ": " << field_name << "\n";
2384b3e95d5SJeremy L Thompson 
2394b3e95d5SJeremy L Thompson   // Get field data
2404b3e95d5SJeremy L Thompson   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
2414b3e95d5SJeremy L Thompson   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
2424b3e95d5SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
2434b3e95d5SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
2444b3e95d5SJeremy L Thompson   }
2453a2968d6SJeremy L Thompson   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
2464b3e95d5SJeremy L Thompson   if (basis != CEED_BASIS_NONE) {
2474b3e95d5SJeremy L Thompson     CeedCallBackend(CeedBasisGetData(basis, &basis_data));
24874398b5aSJeremy L Thompson     CeedCallBackend(CeedBasisGetDimension(basis, &dim));
2499123fb08SJeremy L Thompson     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
2509123fb08SJeremy L Thompson     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
2514b3e95d5SJeremy L Thompson   }
2524b3e95d5SJeremy L Thompson   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
2534b3e95d5SJeremy L Thompson 
2544b3e95d5SJeremy L Thompson   // Set field constants
2550183ed61SJeremy L Thompson   code << tab << "const CeedInt dim" << var_suffix << " = " << dim << ";\n";
25674398b5aSJeremy L Thompson   if (is_tensor && !is_all_tensor) {
25774398b5aSJeremy L Thompson     CeedInt P = 0;
25874398b5aSJeremy L Thompson 
25974398b5aSJeremy L Thompson     CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
2600183ed61SJeremy L Thompson     code << tab << "const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q : P) << ";\n";
26174398b5aSJeremy L Thompson   }
2620183ed61SJeremy L Thompson   code << tab << "const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
263343e3094SJeremy L Thompson   if (eval_mode != CEED_EVAL_WEIGHT) {
2640183ed61SJeremy L Thompson     code << tab << "const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
2654b3e95d5SJeremy L Thompson   }
2664b3e95d5SJeremy L Thompson 
2674b3e95d5SJeremy L Thompson   // Load basis data
2680183ed61SJeremy L Thompson   code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
2694b3e95d5SJeremy L Thompson   switch (eval_mode) {
2704b3e95d5SJeremy L Thompson     case CEED_EVAL_NONE:
2714b3e95d5SJeremy L Thompson       break;
2724b3e95d5SJeremy L Thompson     case CEED_EVAL_INTERP:
2733a2968d6SJeremy L Thompson       if (is_at_points) {
2743a2968d6SJeremy L Thompson         // AtPoints
2753a2968d6SJeremy L Thompson         if (!basis_data->d_chebyshev_interp_1d) {
2763a2968d6SJeremy L Thompson           CeedSize    interp_bytes;
2773a2968d6SJeremy L Thompson           CeedScalar *chebyshev_interp_1d;
2783a2968d6SJeremy L Thompson 
2793a2968d6SJeremy L Thompson           interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
2803a2968d6SJeremy L Thompson           CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
2813a2968d6SJeremy L Thompson           CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
2823a2968d6SJeremy L Thompson           CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
2833a2968d6SJeremy L Thompson           CeedCallHip(CeedBasisReturnCeed(basis),
2843a2968d6SJeremy L Thompson                       hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
2853a2968d6SJeremy L Thompson           CeedCallBackend(CeedFree(&chebyshev_interp_1d));
2863a2968d6SJeremy L Thompson         }
2873a2968d6SJeremy L Thompson         if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
2883a2968d6SJeremy L Thompson         else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
2893a2968d6SJeremy L Thompson       } else {
2903a2968d6SJeremy L Thompson         // Standard quadrature
2914b3e95d5SJeremy L Thompson         if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
2924b3e95d5SJeremy L Thompson         else data->B.outputs[i] = basis_data->d_interp_1d;
2933a2968d6SJeremy L Thompson       }
294ca1da9b9SJeremy L Thompson       if (use_previous_field && !skip_active_load) {
29545a787f7SJeremy L Thompson         std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
2969ee499e5SJeremy L Thompson 
2970183ed61SJeremy L Thompson         code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
2989ee499e5SJeremy L Thompson       } else {
2990ccda8ebSJeremy L Thompson         bool is_collocated = false;
3000ccda8ebSJeremy L Thompson 
3010ccda8ebSJeremy L Thompson         CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
302ca1da9b9SJeremy L Thompson         if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) {
3030ccda8ebSJeremy L Thompson           code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
3040ccda8ebSJeremy L Thompson         } else {
3050183ed61SJeremy L Thompson           code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
3060183ed61SJeremy L Thompson           code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
3079ee499e5SJeremy L Thompson         }
3080ccda8ebSJeremy L Thompson       }
3094b3e95d5SJeremy L Thompson       break;
3104b3e95d5SJeremy L Thompson     case CEED_EVAL_GRAD:
3113a2968d6SJeremy L Thompson       if (is_at_points) {
3123a2968d6SJeremy L Thompson         // AtPoints
3133a2968d6SJeremy L Thompson         if (!basis_data->d_chebyshev_interp_1d) {
3143a2968d6SJeremy L Thompson           CeedSize    interp_bytes;
3153a2968d6SJeremy L Thompson           CeedScalar *chebyshev_interp_1d;
3163a2968d6SJeremy L Thompson 
3173a2968d6SJeremy L Thompson           interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
3183a2968d6SJeremy L Thompson           CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
3193a2968d6SJeremy L Thompson           CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
3203a2968d6SJeremy L Thompson           CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
3213a2968d6SJeremy L Thompson           CeedCallHip(CeedBasisReturnCeed(basis),
3223a2968d6SJeremy L Thompson                       hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
3233a2968d6SJeremy L Thompson           CeedCallBackend(CeedFree(&chebyshev_interp_1d));
3243a2968d6SJeremy L Thompson         }
3253a2968d6SJeremy L Thompson         if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
3263a2968d6SJeremy L Thompson         else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
3273a2968d6SJeremy L Thompson       } else {
3283a2968d6SJeremy L Thompson         // Standard quadrature
3294b3e95d5SJeremy L Thompson         if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
3304b3e95d5SJeremy L Thompson         else data->B.outputs[i] = basis_data->d_interp_1d;
3313a2968d6SJeremy L Thompson       }
3329123fb08SJeremy L Thompson       if (is_tensor) {
333ca1da9b9SJeremy L Thompson         if (use_previous_field && !skip_active_load) {
33445a787f7SJeremy L Thompson           std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
3359ee499e5SJeremy L Thompson 
3360183ed61SJeremy L Thompson           code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
3379ee499e5SJeremy L Thompson         } else {
3380ccda8ebSJeremy L Thompson           bool is_collocated = false;
3390ccda8ebSJeremy L Thompson 
3400ccda8ebSJeremy L Thompson           CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
341ca1da9b9SJeremy L Thompson           if ((is_active && skip_active_load) || (is_collocated && !is_at_points)) {
3420ccda8ebSJeremy L Thompson             code << tab << "CeedScalar *s_B" << var_suffix << " = NULL;\n";
3430ccda8ebSJeremy L Thompson           } else {
3440183ed61SJeremy L Thompson             code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
3450183ed61SJeremy L Thompson             code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
3469123fb08SJeremy L Thompson           }
3479ee499e5SJeremy L Thompson         }
3480ccda8ebSJeremy L Thompson       }
3493a2968d6SJeremy L Thompson       if (is_at_points) break;  // No G mat for AtPoints
3504b3e95d5SJeremy L Thompson       if (use_3d_slices) {
3514b3e95d5SJeremy L Thompson         if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
3524b3e95d5SJeremy L Thompson         else data->G.outputs[i] = basis_data->d_collo_grad_1d;
353ca1da9b9SJeremy L Thompson         if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
35445a787f7SJeremy L Thompson           std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
3559ee499e5SJeremy L Thompson 
3560183ed61SJeremy L Thompson           code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
357ca1da9b9SJeremy L Thompson         } else if (is_active && skip_active_load) {
358ca1da9b9SJeremy L Thompson           code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
3599ee499e5SJeremy L Thompson         } else {
3600183ed61SJeremy L Thompson           code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
3610183ed61SJeremy L Thompson           code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
3629ee499e5SJeremy L Thompson         }
3634b3e95d5SJeremy L Thompson       } else {
3644b3e95d5SJeremy L Thompson         bool has_collo_grad = basis_data->d_collo_grad_1d;
3654b3e95d5SJeremy L Thompson 
3664b3e95d5SJeremy L Thompson         if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
3674b3e95d5SJeremy L Thompson         else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
3684b3e95d5SJeremy L Thompson         if (has_collo_grad) {
369ca1da9b9SJeremy L Thompson           if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
37045a787f7SJeremy L Thompson             std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
3719ee499e5SJeremy L Thompson 
3720183ed61SJeremy L Thompson             code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
373ca1da9b9SJeremy L Thompson           } else if (is_active && skip_active_load) {
374ca1da9b9SJeremy L Thompson             code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
3759ee499e5SJeremy L Thompson           } else {
3760183ed61SJeremy L Thompson             code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
3770183ed61SJeremy L Thompson             code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
3789ee499e5SJeremy L Thompson           }
3799ee499e5SJeremy L Thompson         } else {
380ca1da9b9SJeremy L Thompson           if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD && !skip_active_load) {
38145a787f7SJeremy L Thompson             std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
3829ee499e5SJeremy L Thompson 
3830183ed61SJeremy L Thompson             code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
384ca1da9b9SJeremy L Thompson           } else if (is_active && skip_active_load) {
385ca1da9b9SJeremy L Thompson             code << tab << "CeedScalar *s_G" << var_suffix << " = NULL;\n";
3864b3e95d5SJeremy L Thompson           } else {
3870183ed61SJeremy L Thompson             code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim")
38874398b5aSJeremy L Thompson                  << (is_tensor ? "" : var_suffix) << "];\n";
3890183ed61SJeremy L Thompson             code << tab << "LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << (is_tensor ? "" : var_suffix) << ">(data, G."
39074398b5aSJeremy L Thompson                  << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
3914b3e95d5SJeremy L Thompson           }
3924b3e95d5SJeremy L Thompson         }
3939ee499e5SJeremy L Thompson       }
3944b3e95d5SJeremy L Thompson       break;
3954b3e95d5SJeremy L Thompson     case CEED_EVAL_WEIGHT:
3964b3e95d5SJeremy L Thompson       break;  // No action
3974b3e95d5SJeremy L Thompson       // LCOV_EXCL_START
3984b3e95d5SJeremy L Thompson     case CEED_EVAL_DIV:
3994b3e95d5SJeremy L Thompson     case CEED_EVAL_CURL:
4004b3e95d5SJeremy L Thompson       break;  // TODO: Not implemented
4014b3e95d5SJeremy L Thompson               // LCOV_EXCL_STOP
4024b3e95d5SJeremy L Thompson   }
4033a2968d6SJeremy L Thompson   CeedCallBackend(CeedBasisDestroy(&basis));
4044b3e95d5SJeremy L Thompson   return CEED_ERROR_SUCCESS;
4054b3e95d5SJeremy L Thompson }
4064b3e95d5SJeremy L Thompson 
4074b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
4084b3e95d5SJeremy L Thompson // Restriction
4094b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream & code,CeedOperator_Hip_gen * data,Tab & tab,CeedInt i,CeedInt field_input_buffer[],CeedOperatorField op_field,CeedQFunctionField qf_field,CeedInt max_dim,CeedInt Q_1d,bool is_input,bool is_all_tensor,bool is_at_points,bool use_3d_slices)4100183ed61SJeremy L Thompson static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i,
4110183ed61SJeremy L Thompson                                                       CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
4120183ed61SJeremy L Thompson                                                       CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
4130183ed61SJeremy L Thompson                                                       bool use_3d_slices) {
4144b3e95d5SJeremy L Thompson   std::string              var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
41574398b5aSJeremy L Thompson   std::string              P_name     = (is_all_tensor ? "P_1d" : "P") + var_suffix;
4164b3e95d5SJeremy L Thompson   CeedEvalMode             eval_mode  = CEED_EVAL_NONE;
41774398b5aSJeremy L Thompson   CeedInt                  elem_size = 0, num_comp = 0;
4184b3e95d5SJeremy L Thompson   CeedSize                 l_size;
419f815fac9SJeremy L Thompson   CeedRestrictionType      rstr_type = CEED_RESTRICTION_STANDARD;
4204b3e95d5SJeremy L Thompson   CeedElemRestriction_Hip *rstr_data;
4214b3e95d5SJeremy L Thompson   CeedElemRestriction      elem_rstr;
4224b3e95d5SJeremy L Thompson 
4234b3e95d5SJeremy L Thompson   // Get field data
4244b3e95d5SJeremy L Thompson   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
4254b3e95d5SJeremy L Thompson   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
426f815fac9SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
4274b3e95d5SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
4284b3e95d5SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
4294b3e95d5SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
4304b3e95d5SJeremy L Thompson   }
4314b3e95d5SJeremy L Thompson   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
4324b3e95d5SJeremy L Thompson 
4334b3e95d5SJeremy L Thompson   // Restriction
4344b3e95d5SJeremy L Thompson   if (is_input) {
4354b3e95d5SJeremy L Thompson     // Input
436e93651e5SJeremy L Thompson     if (field_input_buffer[i] != i) {
437e93651e5SJeremy L Thompson       std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);
438e93651e5SJeremy L Thompson 
439e93651e5SJeremy L Thompson       // Restriction was already done for previous input
4400183ed61SJeremy L Thompson       code << tab << "CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
4413a2968d6SJeremy L Thompson     } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) {
4423a2968d6SJeremy L Thompson       if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) {
443e93651e5SJeremy L Thompson         // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated
4440183ed61SJeremy L Thompson         code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
4453a2968d6SJeremy L Thompson       } else if (rstr_type != CEED_RESTRICTION_POINTS) {
446e93651e5SJeremy L Thompson         // Otherwise we're using the scratch space
4470183ed61SJeremy L Thompson         code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
448e93651e5SJeremy L Thompson       }
449f815fac9SJeremy L Thompson       switch (rstr_type) {
450f815fac9SJeremy L Thompson         case CEED_RESTRICTION_STANDARD: {
4514b3e95d5SJeremy L Thompson           CeedInt comp_stride;
4524b3e95d5SJeremy L Thompson 
4534b3e95d5SJeremy L Thompson           CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
4540183ed61SJeremy L Thompson           code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
4554b3e95d5SJeremy L Thompson           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
4560183ed61SJeremy L Thompson           code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
4574b3e95d5SJeremy L Thompson           data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
4580183ed61SJeremy L Thompson           code << tab << "ReadLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
4590183ed61SJeremy L Thompson                << P_name << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix
4600183ed61SJeremy L Thompson                << ");\n";
461f815fac9SJeremy L Thompson           break;
462f815fac9SJeremy L Thompson         }
463f815fac9SJeremy L Thompson         case CEED_RESTRICTION_STRIDED: {
4644b3e95d5SJeremy L Thompson           bool    has_backend_strides;
4654b3e95d5SJeremy L Thompson           CeedInt num_elem;
4664b3e95d5SJeremy L Thompson 
4674b3e95d5SJeremy L Thompson           CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
4684b3e95d5SJeremy L Thompson           CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
4694b3e95d5SJeremy L Thompson           CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
4704b3e95d5SJeremy L Thompson 
4714b3e95d5SJeremy L Thompson           if (!has_backend_strides) {
4724b3e95d5SJeremy L Thompson             CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
4734b3e95d5SJeremy L Thompson           }
4740183ed61SJeremy L Thompson           code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
4750183ed61SJeremy L Thompson                << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
4760183ed61SJeremy L Thompson           code << tab << "ReadLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
4770183ed61SJeremy L Thompson                << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, d" << var_suffix << ", r_e"
4780183ed61SJeremy L Thompson                << var_suffix << ");\n";
479f815fac9SJeremy L Thompson           break;
480f815fac9SJeremy L Thompson         }
4813a2968d6SJeremy L Thompson         case CEED_RESTRICTION_POINTS: {
4823a2968d6SJeremy L Thompson           CeedInt comp_stride;
4833a2968d6SJeremy L Thompson 
4843a2968d6SJeremy L Thompson           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
4850183ed61SJeremy L Thompson           code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
4863a2968d6SJeremy L Thompson           data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
4873a2968d6SJeremy L Thompson           break;
4883a2968d6SJeremy L Thompson         }
489f815fac9SJeremy L Thompson         // LCOV_EXCL_START
490f815fac9SJeremy L Thompson         case CEED_RESTRICTION_ORIENTED:
491f815fac9SJeremy L Thompson         case CEED_RESTRICTION_CURL_ORIENTED:
492f815fac9SJeremy L Thompson           break;  // TODO: Not implemented
493f815fac9SJeremy L Thompson                   // LCOV_EXCL_STOP
4944b3e95d5SJeremy L Thompson       }
4954b3e95d5SJeremy L Thompson     }
4964b3e95d5SJeremy L Thompson   } else {
4974b3e95d5SJeremy L Thompson     // Output
498f815fac9SJeremy L Thompson     switch (rstr_type) {
499f815fac9SJeremy L Thompson       case CEED_RESTRICTION_STANDARD: {
5004b3e95d5SJeremy L Thompson         CeedInt comp_stride;
5014b3e95d5SJeremy L Thompson 
5024b3e95d5SJeremy L Thompson         CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
5030183ed61SJeremy L Thompson         code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
5044b3e95d5SJeremy L Thompson         CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
5050183ed61SJeremy L Thompson         code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
5064b3e95d5SJeremy L Thompson         data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
5070183ed61SJeremy L Thompson         code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
5080183ed61SJeremy L Thompson              << P_name << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix
5090183ed61SJeremy L Thompson              << ");\n";
510f815fac9SJeremy L Thompson         break;
511f815fac9SJeremy L Thompson       }
512f815fac9SJeremy L Thompson       case CEED_RESTRICTION_STRIDED: {
5134b3e95d5SJeremy L Thompson         bool    has_backend_strides;
5144b3e95d5SJeremy L Thompson         CeedInt num_elem;
5154b3e95d5SJeremy L Thompson 
5164b3e95d5SJeremy L Thompson         CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
5174b3e95d5SJeremy L Thompson         CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
5184b3e95d5SJeremy L Thompson         CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
5194b3e95d5SJeremy L Thompson 
5204b3e95d5SJeremy L Thompson         if (!has_backend_strides) {
5214b3e95d5SJeremy L Thompson           CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
5224b3e95d5SJeremy L Thompson         }
5230183ed61SJeremy L Thompson         code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
5240183ed61SJeremy L Thompson              << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
5250183ed61SJeremy L Thompson         code << tab << "WriteLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
5260183ed61SJeremy L Thompson              << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, r_e" << var_suffix << ", d" << var_suffix
5270183ed61SJeremy L Thompson              << ");\n";
528f815fac9SJeremy L Thompson         break;
529f815fac9SJeremy L Thompson       }
5303a2968d6SJeremy L Thompson       case CEED_RESTRICTION_POINTS:
5313a2968d6SJeremy L Thompson         data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
5323a2968d6SJeremy L Thompson         break;
533f815fac9SJeremy L Thompson       // LCOV_EXCL_START
534f815fac9SJeremy L Thompson       case CEED_RESTRICTION_ORIENTED:
535f815fac9SJeremy L Thompson       case CEED_RESTRICTION_CURL_ORIENTED:
536f815fac9SJeremy L Thompson         break;  // TODO: Not implemented
537f815fac9SJeremy L Thompson                 // LCOV_EXCL_STOP
5384b3e95d5SJeremy L Thompson     }
5394b3e95d5SJeremy L Thompson   }
5403a2968d6SJeremy L Thompson   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
5414b3e95d5SJeremy L Thompson   return CEED_ERROR_SUCCESS;
5424b3e95d5SJeremy L Thompson }
5434b3e95d5SJeremy L Thompson 
5444b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
5454b3e95d5SJeremy L Thompson // Basis
5464b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream & code,CeedOperator_Hip_gen * data,Tab & tab,CeedInt i,CeedOperatorField op_field,CeedQFunctionField qf_field,CeedInt max_dim,CeedInt Q_1d,bool is_input,bool is_all_tensor,bool is_at_points,bool use_3d_slices)5470183ed61SJeremy L Thompson static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i, CeedOperatorField op_field,
54874398b5aSJeremy L Thompson                                                 CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor,
5493a2968d6SJeremy L Thompson                                                 bool is_at_points, bool use_3d_slices) {
5500ccda8ebSJeremy L Thompson   bool      is_tensor = true, is_collocated = true;
55174398b5aSJeremy L Thompson   CeedBasis basis;
55274398b5aSJeremy L Thompson   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
55374398b5aSJeremy L Thompson   CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
5540ccda8ebSJeremy L Thompson   CeedCallBackend(CeedBasisIsCollocated(basis, &is_collocated));
55574398b5aSJeremy L Thompson 
5564b3e95d5SJeremy L Thompson   std::string         var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
5579123fb08SJeremy L Thompson   std::string         P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
5584b3e95d5SJeremy L Thompson   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
55974398b5aSJeremy L Thompson   CeedInt             dim = max_dim, elem_size = 0, num_comp = 0, P_1d = 0;
5604b3e95d5SJeremy L Thompson   CeedElemRestriction elem_rstr;
5614b3e95d5SJeremy L Thompson 
5624b3e95d5SJeremy L Thompson   // Get field data
5634b3e95d5SJeremy L Thompson   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
5644b3e95d5SJeremy L Thompson   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
5654b3e95d5SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
5664b3e95d5SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
5674b3e95d5SJeremy L Thompson   }
5683a2968d6SJeremy L Thompson   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
5694b3e95d5SJeremy L Thompson   if (basis != CEED_BASIS_NONE) {
57074398b5aSJeremy L Thompson     CeedCallBackend(CeedBasisGetDimension(basis, &dim));
5719123fb08SJeremy L Thompson     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
5729123fb08SJeremy L Thompson     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
5734b3e95d5SJeremy L Thompson   }
5744b3e95d5SJeremy L Thompson   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
5754b3e95d5SJeremy L Thompson 
5764b3e95d5SJeremy L Thompson   // Basis
5770183ed61SJeremy L Thompson   code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
5784b3e95d5SJeremy L Thompson   if (is_input) {
5794b3e95d5SJeremy L Thompson     switch (eval_mode) {
5804b3e95d5SJeremy L Thompson       case CEED_EVAL_NONE:
5813a2968d6SJeremy L Thompson         if (!use_3d_slices && !is_at_points) {
5820183ed61SJeremy L Thompson           code << tab << "CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n";
5834b3e95d5SJeremy L Thompson         }
5844b3e95d5SJeremy L Thompson         break;
5854b3e95d5SJeremy L Thompson       case CEED_EVAL_INTERP:
5863a2968d6SJeremy L Thompson         if (is_at_points) {
5879123fb08SJeremy L Thompson           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
5889123fb08SJeremy L Thompson 
5890183ed61SJeremy L Thompson           code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
5900183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
5916b92dc4bSJeremy L Thompson                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
5923a2968d6SJeremy L Thompson         } else {
5930ccda8ebSJeremy L Thompson           std::string function_name = is_tensor ? ((dim == 1 ? "Interp" : "InterpTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
5940ccda8ebSJeremy L Thompson                                                    std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
59574398b5aSJeremy L Thompson                                                 : "InterpNonTensor";
59674398b5aSJeremy L Thompson           std::string op_t_1d_name  = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
5979123fb08SJeremy L Thompson 
5980183ed61SJeremy L Thompson           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
5990183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
60074398b5aSJeremy L Thompson                << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
6013a2968d6SJeremy L Thompson         }
6024b3e95d5SJeremy L Thompson         break;
6034b3e95d5SJeremy L Thompson       case CEED_EVAL_GRAD:
6043a2968d6SJeremy L Thompson         if (is_at_points) {
6059123fb08SJeremy L Thompson           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
6069123fb08SJeremy L Thompson 
6070183ed61SJeremy L Thompson           code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
6080183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
6096b92dc4bSJeremy L Thompson                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
6103a2968d6SJeremy L Thompson         } else if (use_3d_slices) {
6110ccda8ebSJeremy L Thompson           std::string function_name =
6120ccda8ebSJeremy L Thompson               (dim > 1 ? "InterpTensor" : "Interp") + std::string(is_collocated ? "CollocatedNodes" : "") + std::to_string(dim) + "d";
6139123fb08SJeremy L Thompson 
6140183ed61SJeremy L Thompson           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
6150183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
6166b92dc4bSJeremy L Thompson                << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
6179123fb08SJeremy L Thompson         } else if (is_tensor) {
6180ccda8ebSJeremy L Thompson           bool        is_collocated_grad = dim == 3 && Q_1d >= P_1d;
6190ccda8ebSJeremy L Thompson           std::string function_name =
6200ccda8ebSJeremy L Thompson               (dim == 1 ? "Grad" : ("GradTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) +
6210ccda8ebSJeremy L Thompson               std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
62274398b5aSJeremy L Thompson           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
6239123fb08SJeremy L Thompson 
6240183ed61SJeremy L Thompson           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
62574398b5aSJeremy L Thompson                << (is_all_tensor && dim >= 3 ? Q_name : "1") << "];\n";
6260183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
62774398b5aSJeremy L Thompson                << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
6284b3e95d5SJeremy L Thompson         } else {
6299123fb08SJeremy L Thompson           std::string function_name = "GradNonTensor";
6309123fb08SJeremy L Thompson 
6310183ed61SJeremy L Thompson           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
6320183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
63374398b5aSJeremy L Thompson                << ", OP_T_1D>(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
6344b3e95d5SJeremy L Thompson         }
6354b3e95d5SJeremy L Thompson         break;
6364b3e95d5SJeremy L Thompson       case CEED_EVAL_WEIGHT: {
6373a2968d6SJeremy L Thompson         if (is_at_points) {
6380183ed61SJeremy L Thompson           code << tab << "// Nothing to do AtPoints\n";
6393a2968d6SJeremy L Thompson         } else {
6404b3e95d5SJeremy L Thompson           CeedBasis_Hip_shared *basis_data;
64174398b5aSJeremy L Thompson           std::string           function_name = is_tensor
64274398b5aSJeremy L Thompson                                                     ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
64374398b5aSJeremy L Thompson                                                     : "WeightNonTensor";
6444b3e95d5SJeremy L Thompson 
6450183ed61SJeremy L Thompson           code << tab << "CeedScalar r_q" << var_suffix << "[" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
6464b3e95d5SJeremy L Thompson           CeedCallBackend(CeedBasisGetData(basis, &basis_data));
6474b3e95d5SJeremy L Thompson           data->W = basis_data->d_q_weight_1d;
6480183ed61SJeremy L Thompson           code << tab << function_name << "<" << P_name << ", " << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
6493a2968d6SJeremy L Thompson         }
6504b3e95d5SJeremy L Thompson         break;
6514b3e95d5SJeremy L Thompson       }
6524b3e95d5SJeremy L Thompson       // LCOV_EXCL_START
6534b3e95d5SJeremy L Thompson       case CEED_EVAL_DIV:
6544b3e95d5SJeremy L Thompson       case CEED_EVAL_CURL:
6554b3e95d5SJeremy L Thompson         break;  // TODO: Not implemented
6564b3e95d5SJeremy L Thompson                 // LCOV_EXCL_STOP
6574b3e95d5SJeremy L Thompson     }
6584b3e95d5SJeremy L Thompson   } else {
6594b3e95d5SJeremy L Thompson     switch (eval_mode) {
6604b3e95d5SJeremy L Thompson       case CEED_EVAL_NONE:
6610183ed61SJeremy L Thompson         code << tab << "CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
6624b3e95d5SJeremy L Thompson         break;  // No action
6634b3e95d5SJeremy L Thompson       case CEED_EVAL_INTERP:
6640183ed61SJeremy L Thompson         code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
6653a2968d6SJeremy L Thompson         if (is_at_points) {
6669123fb08SJeremy L Thompson           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
6679123fb08SJeremy L Thompson 
6680183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
6696b92dc4bSJeremy L Thompson                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
6703a2968d6SJeremy L Thompson         } else {
6719123fb08SJeremy L Thompson           std::string function_name =
6720ccda8ebSJeremy L Thompson               is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
6730ccda8ebSJeremy L Thompson                            std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
67474398b5aSJeremy L Thompson                         : "InterpTransposeNonTensor";
67574398b5aSJeremy L Thompson           std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
6769123fb08SJeremy L Thompson 
6770183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
67874398b5aSJeremy L Thompson                << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
6793a2968d6SJeremy L Thompson         }
6804b3e95d5SJeremy L Thompson         break;
6814b3e95d5SJeremy L Thompson       case CEED_EVAL_GRAD:
6820183ed61SJeremy L Thompson         code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
6833a2968d6SJeremy L Thompson         if (is_at_points) {
6849123fb08SJeremy L Thompson           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
6859123fb08SJeremy L Thompson 
6860183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
6876b92dc4bSJeremy L Thompson                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
6883a2968d6SJeremy L Thompson         } else if (use_3d_slices) {
6890ccda8ebSJeremy L Thompson           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::string(is_collocated ? "CollocatedNodes" : "") +
6900ccda8ebSJeremy L Thompson                                       std::to_string(dim) + "d";
6919123fb08SJeremy L Thompson 
6920183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
6936b92dc4bSJeremy L Thompson                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
6949123fb08SJeremy L Thompson         } else if (is_tensor) {
6950ccda8ebSJeremy L Thompson           bool        is_collocated_grad = dim == 3 && Q_1d >= P_1d;
6960ccda8ebSJeremy L Thompson           std::string function_name =
6970ccda8ebSJeremy L Thompson               (dim == 1 ? "GradTranspose"
6980ccda8ebSJeremy L Thompson                         : ("GradTransposeTensor" + std::string(is_collocated ? "CollocatedNodes" : (is_collocated_grad ? "Collocated" : "")))) +
69974398b5aSJeremy L Thompson               std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
70074398b5aSJeremy L Thompson           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
7019123fb08SJeremy L Thompson 
7020183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
70374398b5aSJeremy L Thompson                << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
7044b3e95d5SJeremy L Thompson         } else {
7059123fb08SJeremy L Thompson           std::string function_name = "GradTransposeNonTensor";
7069123fb08SJeremy L Thompson 
7070183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
70874398b5aSJeremy L Thompson                << ", OP_T_1D>(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
7094b3e95d5SJeremy L Thompson         }
7104b3e95d5SJeremy L Thompson         break;
7114b3e95d5SJeremy L Thompson       // LCOV_EXCL_START
7124b3e95d5SJeremy L Thompson       case CEED_EVAL_WEIGHT:
7134b3e95d5SJeremy L Thompson         break;  // Should not occur
7144b3e95d5SJeremy L Thompson       case CEED_EVAL_DIV:
7154b3e95d5SJeremy L Thompson       case CEED_EVAL_CURL:
7164b3e95d5SJeremy L Thompson         break;  // TODO: Not implemented
7174b3e95d5SJeremy L Thompson                 // LCOV_EXCL_STOP
7184b3e95d5SJeremy L Thompson     }
7194b3e95d5SJeremy L Thompson   }
7203a2968d6SJeremy L Thompson   CeedCallBackend(CeedBasisDestroy(&basis));
7214b3e95d5SJeremy L Thompson   return CEED_ERROR_SUCCESS;
7224b3e95d5SJeremy L Thompson }
7234b3e95d5SJeremy L Thompson 
7244b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
7254b3e95d5SJeremy L Thompson // QFunction
7264b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream & code,CeedOperator_Hip_gen * data,Tab & tab,CeedInt max_dim,CeedInt max_num_points,CeedInt num_input_fields,CeedOperatorField * op_input_fields,CeedQFunctionField * qf_input_fields,CeedInt num_output_fields,CeedOperatorField * op_output_fields,CeedQFunctionField * qf_output_fields,std::string qfunction_name,CeedInt Q_1d,bool is_all_tensor,bool is_at_points,bool use_3d_slices,bool is_assemble)7270183ed61SJeremy L Thompson static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt max_dim,
7280183ed61SJeremy L Thompson                                                     CeedInt max_num_points, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
7290183ed61SJeremy L Thompson                                                     CeedQFunctionField *qf_input_fields, CeedInt num_output_fields,
7300183ed61SJeremy L Thompson                                                     CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields,
7310183ed61SJeremy L Thompson                                                     std::string qfunction_name, CeedInt Q_1d, bool is_all_tensor, bool is_at_points,
732*745f16d1SZach Atkins                                                     bool use_3d_slices, bool is_assemble) {
73374398b5aSJeremy L Thompson   std::string         Q_name    = is_all_tensor ? "Q_1d" : "Q";
7344b3e95d5SJeremy L Thompson   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
7354b3e95d5SJeremy L Thompson   CeedElemRestriction elem_rstr;
7364b3e95d5SJeremy L Thompson 
7378b97b69aSJeremy L Thompson   // Setup output arrays
7380183ed61SJeremy L Thompson   code << "\n";
7390183ed61SJeremy L Thompson   code << tab << "// -- Output field setup\n";
7404b3e95d5SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
74159fa3f92SJeremy L Thompson     const char *field_name;
7424b3e95d5SJeremy L Thompson     std::string var_suffix = "_out_" + std::to_string(i);
7434b3e95d5SJeremy L Thompson 
74459fa3f92SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
7450183ed61SJeremy L Thompson     code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
7464b3e95d5SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
7473a2968d6SJeremy L Thompson     switch (eval_mode) {
7483a2968d6SJeremy L Thompson       case CEED_EVAL_NONE:
7493a2968d6SJeremy L Thompson         if (is_at_points) {
7500183ed61SJeremy L Thompson           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n";
7513a2968d6SJeremy L Thompson         } else {
7520183ed61SJeremy L Thompson           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
75374398b5aSJeremy L Thompson                << "];\n";
7544b3e95d5SJeremy L Thompson         }
7553a2968d6SJeremy L Thompson         break;
7563a2968d6SJeremy L Thompson       case CEED_EVAL_INTERP:
7573a2968d6SJeremy L Thompson         if (is_at_points) {
7583a2968d6SJeremy L Thompson           // Accumulator for point data
7590183ed61SJeremy L Thompson           code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
7600183ed61SJeremy L Thompson           code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
761b8245c6cSJeremy L Thompson                << "[i] = 0.0;\n";
7623a2968d6SJeremy L Thompson         } else {
7630183ed61SJeremy L Thompson           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
76474398b5aSJeremy L Thompson                << "];\n";
7653a2968d6SJeremy L Thompson         }
7663a2968d6SJeremy L Thompson         break;
7673a2968d6SJeremy L Thompson       case CEED_EVAL_GRAD:
7683a2968d6SJeremy L Thompson         if (is_at_points) {
7693a2968d6SJeremy L Thompson           // Accumulator for point data
7700183ed61SJeremy L Thompson           code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
7710183ed61SJeremy L Thompson           code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
772b8245c6cSJeremy L Thompson                << "[i] = 0.0;\n";
7733a2968d6SJeremy L Thompson         } else if (use_3d_slices) {
7744b3e95d5SJeremy L Thompson           // Accumulator for gradient slices
7750183ed61SJeremy L Thompson           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
7760183ed61SJeremy L Thompson           code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) r_q" << var_suffix << "[i] = 0.0;\n";
7774b3e95d5SJeremy L Thompson         } else {
7780183ed61SJeremy L Thompson           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
77974398b5aSJeremy L Thompson                << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") << "];\n";
7804b3e95d5SJeremy L Thompson         }
7813a2968d6SJeremy L Thompson         break;
7823a2968d6SJeremy L Thompson       case CEED_EVAL_WEIGHT:
7833a2968d6SJeremy L Thompson         break;
7843a2968d6SJeremy L Thompson         // LCOV_EXCL_START
7853a2968d6SJeremy L Thompson       case CEED_EVAL_DIV:
7863a2968d6SJeremy L Thompson       case CEED_EVAL_CURL:
7873a2968d6SJeremy L Thompson         break;  // TODO: Not implemented
7883a2968d6SJeremy L Thompson                 // LCOV_EXCL_STOP
7894b3e95d5SJeremy L Thompson     }
7904b3e95d5SJeremy L Thompson   }
7914b3e95d5SJeremy L Thompson 
7923a2968d6SJeremy L Thompson   if (is_at_points) {
7933a2968d6SJeremy L Thompson     // We need to handle batches of points
7940183ed61SJeremy L Thompson     code << "\n";
7950183ed61SJeremy L Thompson     code << tab << "// Note: Using batches of points\n";
7960183ed61SJeremy L Thompson     code << tab << "const CeedInt point_loop_bound = (blockDim.x*blockDim.y) * ceil((1.0*max_num_points) / (blockDim.x*blockDim.y));\n\n";
7970183ed61SJeremy L Thompson     code << tab << "#pragma unroll\n";
7980183ed61SJeremy L Thompson     code << tab << "for (CeedInt i = threadIdx.x + threadIdx.y*blockDim.x; i < point_loop_bound; i += blockDim.x*blockDim.y) {\n";
7990183ed61SJeremy L Thompson     tab.push();
8000183ed61SJeremy L Thompson     code << tab << "const CeedInt p = i % max_num_points;\n\n";
8013a2968d6SJeremy L Thompson 
8020183ed61SJeremy L Thompson     code << tab << "// -- Coordinates\n";
8030183ed61SJeremy L Thompson     code << tab << "CeedScalar r_x[max_dim];\n";
8040183ed61SJeremy L Thompson     code << tab << "ReadPoint<max_dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n";
8053a2968d6SJeremy L Thompson 
8060183ed61SJeremy L Thompson     code << tab << "// -- Input fields\n";
8073a2968d6SJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) {
80859fa3f92SJeremy L Thompson       const char *field_name;
8093a2968d6SJeremy L Thompson       std::string var_suffix = "_in_" + std::to_string(i);
810f725b54bSJeremy L Thompson       std::string P_name     = "P_1d" + var_suffix;
8113a2968d6SJeremy L Thompson 
81259fa3f92SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
8130183ed61SJeremy L Thompson       code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
8143a2968d6SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
8153a2968d6SJeremy L Thompson       // Basis action
8160183ed61SJeremy L Thompson       code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
8173a2968d6SJeremy L Thompson       switch (eval_mode) {
8183a2968d6SJeremy L Thompson         case CEED_EVAL_NONE:
8190183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
8200183ed61SJeremy L Thompson           code << tab << "ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
8213a2968d6SJeremy L Thompson                << ", max_num_points>(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
8223a2968d6SJeremy L Thompson           break;
8233a2968d6SJeremy L Thompson         case CEED_EVAL_INTERP:
8240183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
8250183ed61SJeremy L Thompson           code << tab << "InterpAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
82674398b5aSJeremy L Thompson                << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
8273a2968d6SJeremy L Thompson           break;
8283a2968d6SJeremy L Thompson         case CEED_EVAL_GRAD:
8290183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
8300183ed61SJeremy L Thompson           code << tab << "GradAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
83174398b5aSJeremy L Thompson                << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
8323a2968d6SJeremy L Thompson           break;
8333a2968d6SJeremy L Thompson         case CEED_EVAL_WEIGHT:
8340183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[1];\n";
8350183ed61SJeremy L Thompson           code << tab << "r_s" << var_suffix << "[0] = 1.0;\n";
8363a2968d6SJeremy L Thompson           break;
8373a2968d6SJeremy L Thompson           // LCOV_EXCL_START
8383a2968d6SJeremy L Thompson         case CEED_EVAL_DIV:
8393a2968d6SJeremy L Thompson         case CEED_EVAL_CURL:
8403a2968d6SJeremy L Thompson           break;  // TODO: Not implemented
8413a2968d6SJeremy L Thompson                   // LCOV_EXCL_STOP
8423a2968d6SJeremy L Thompson       }
8433a2968d6SJeremy L Thompson     }
8440183ed61SJeremy L Thompson     code << "\n";
8450183ed61SJeremy L Thompson     code << tab << "// -- Output fields\n";
8463a2968d6SJeremy L Thompson     for (CeedInt i = 0; i < num_output_fields; i++) {
84759fa3f92SJeremy L Thompson       const char *field_name;
8483a2968d6SJeremy L Thompson       std::string var_suffix = "_out_" + std::to_string(i);
8493a2968d6SJeremy L Thompson 
85059fa3f92SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
8510183ed61SJeremy L Thompson       code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
8523a2968d6SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
8533a2968d6SJeremy L Thompson       // Basis action
8543a2968d6SJeremy L Thompson       switch (eval_mode) {
8553a2968d6SJeremy L Thompson         case CEED_EVAL_NONE:
8560183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
8573a2968d6SJeremy L Thompson           break;
8583a2968d6SJeremy L Thompson         case CEED_EVAL_INTERP:
8590183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
8603a2968d6SJeremy L Thompson           break;
8613a2968d6SJeremy L Thompson         case CEED_EVAL_GRAD:
8620183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
8633a2968d6SJeremy L Thompson           break;
8643a2968d6SJeremy L Thompson           // LCOV_EXCL_START
8653a2968d6SJeremy L Thompson         case CEED_EVAL_WEIGHT:
8663a2968d6SJeremy L Thompson           break;  // Should not occur
8673a2968d6SJeremy L Thompson         case CEED_EVAL_DIV:
8683a2968d6SJeremy L Thompson         case CEED_EVAL_CURL:
8693a2968d6SJeremy L Thompson           break;  // TODO: Not implemented
8703a2968d6SJeremy L Thompson                   // LCOV_EXCL_STOP
8713a2968d6SJeremy L Thompson       }
8723a2968d6SJeremy L Thompson     }
8733a2968d6SJeremy L Thompson 
8743a2968d6SJeremy L Thompson   } else if (use_3d_slices) {
8754b3e95d5SJeremy L Thompson     // We treat quadrature points per slice in 3d to save registers
8760183ed61SJeremy L Thompson     code << "\n";
8770183ed61SJeremy L Thompson     code << tab << "// Note: Using planes of 3D elements\n";
8780183ed61SJeremy L Thompson     code << tab << "#pragma unroll\n";
8790183ed61SJeremy L Thompson     code << tab << "for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
8800183ed61SJeremy L Thompson     tab.push();
8810183ed61SJeremy L Thompson     code << tab << "// -- Input fields\n";
8824b3e95d5SJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) {
88359fa3f92SJeremy L Thompson       const char *field_name;
8844b3e95d5SJeremy L Thompson       std::string var_suffix = "_in_" + std::to_string(i);
8854b3e95d5SJeremy L Thompson 
88659fa3f92SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
8870183ed61SJeremy L Thompson       code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
8884b3e95d5SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
8894b3e95d5SJeremy L Thompson       // Basis action
8900183ed61SJeremy L Thompson       code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
8914b3e95d5SJeremy L Thompson       switch (eval_mode) {
8924b3e95d5SJeremy L Thompson         case CEED_EVAL_NONE:
8934b3e95d5SJeremy L Thompson           bool is_strided;
8944b3e95d5SJeremy L Thompson 
8950183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
8964b3e95d5SJeremy L Thompson 
8974b3e95d5SJeremy L Thompson           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
8984b3e95d5SJeremy L Thompson           CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
8994b3e95d5SJeremy L Thompson           if (is_strided) {
9004b3e95d5SJeremy L Thompson             bool    has_backend_strides;
9014b3e95d5SJeremy L Thompson             CeedInt num_elem, elem_size;
9024b3e95d5SJeremy L Thompson 
9034b3e95d5SJeremy L Thompson             CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
9044b3e95d5SJeremy L Thompson             CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
9054b3e95d5SJeremy L Thompson             CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
9064b3e95d5SJeremy L Thompson             CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
9074b3e95d5SJeremy L Thompson 
9084b3e95d5SJeremy L Thompson             if (!has_backend_strides) {
9094b3e95d5SJeremy L Thompson               CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
9104b3e95d5SJeremy L Thompson             }
9110183ed61SJeremy L Thompson             code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
9120183ed61SJeremy L Thompson                  << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
9130183ed61SJeremy L Thompson             code << tab << "ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", strides" << var_suffix << "_0, strides"
9140183ed61SJeremy L Thompson                  << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n";
9154b3e95d5SJeremy L Thompson           } else {
9164b3e95d5SJeremy L Thompson             CeedSize                 l_size = 0;
9174b3e95d5SJeremy L Thompson             CeedInt                  comp_stride;
9184b3e95d5SJeremy L Thompson             CeedElemRestriction_Hip *rstr_data;
9194b3e95d5SJeremy L Thompson 
9204b3e95d5SJeremy L Thompson             CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
9210183ed61SJeremy L Thompson             code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
9224b3e95d5SJeremy L Thompson             CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
9230183ed61SJeremy L Thompson             code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
9244b3e95d5SJeremy L Thompson             CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
9254b3e95d5SJeremy L Thompson             data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
9260183ed61SJeremy L Thompson             code << tab << "ReadEVecSliceStandard3d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", " << Q_name << ">(data, l_size"
9270183ed61SJeremy L Thompson                  << var_suffix << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
9284b3e95d5SJeremy L Thompson           }
9299123fb08SJeremy L Thompson           CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
9304b3e95d5SJeremy L Thompson           break;
9314b3e95d5SJeremy L Thompson         case CEED_EVAL_INTERP:
9320183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
9330183ed61SJeremy L Thompson           code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n";
9340183ed61SJeremy L Thompson           tab.push();
9350183ed61SJeremy L Thompson           code << tab << "r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n";
9360183ed61SJeremy L Thompson           tab.pop();
9370183ed61SJeremy L Thompson           code << tab << "}\n";
9384b3e95d5SJeremy L Thompson           break;
9394b3e95d5SJeremy L Thompson         case CEED_EVAL_GRAD:
9400183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
9410183ed61SJeremy L Thompson           code << tab << "GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_q" << var_suffix << ", s_G"
9426b92dc4bSJeremy L Thompson                << var_suffix << ", r_s" << var_suffix << ");\n";
9434b3e95d5SJeremy L Thompson           break;
9444b3e95d5SJeremy L Thompson         case CEED_EVAL_WEIGHT:
9450183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[1];\n";
9460183ed61SJeremy L Thompson           code << tab << "r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n";
9473a2968d6SJeremy L Thompson           break;
9484b3e95d5SJeremy L Thompson           // LCOV_EXCL_START
9494b3e95d5SJeremy L Thompson         case CEED_EVAL_DIV:
9504b3e95d5SJeremy L Thompson         case CEED_EVAL_CURL:
9514b3e95d5SJeremy L Thompson           break;  // TODO: Not implemented
9524b3e95d5SJeremy L Thompson                   // LCOV_EXCL_STOP
9534b3e95d5SJeremy L Thompson       }
9544b3e95d5SJeremy L Thompson     }
9550183ed61SJeremy L Thompson     code << "\n";
9560183ed61SJeremy L Thompson     code << tab << "// -- Output fields\n";
9574b3e95d5SJeremy L Thompson     for (CeedInt i = 0; i < num_output_fields; i++) {
95859fa3f92SJeremy L Thompson       const char *field_name;
9594b3e95d5SJeremy L Thompson       std::string var_suffix = "_out_" + std::to_string(i);
9604b3e95d5SJeremy L Thompson 
96159fa3f92SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
9620183ed61SJeremy L Thompson       code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
9634b3e95d5SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
9644b3e95d5SJeremy L Thompson       // Basis action
9654b3e95d5SJeremy L Thompson       switch (eval_mode) {
9664b3e95d5SJeremy L Thompson         case CEED_EVAL_NONE:
9670183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
9683a2968d6SJeremy L Thompson           break;
9694b3e95d5SJeremy L Thompson         case CEED_EVAL_INTERP:
9700183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
9714b3e95d5SJeremy L Thompson           break;
9724b3e95d5SJeremy L Thompson         case CEED_EVAL_GRAD:
9730183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
9744b3e95d5SJeremy L Thompson           break;
9754b3e95d5SJeremy L Thompson           // LCOV_EXCL_START
9764b3e95d5SJeremy L Thompson         case CEED_EVAL_WEIGHT:
9774b3e95d5SJeremy L Thompson           break;  // Should not occur
9784b3e95d5SJeremy L Thompson         case CEED_EVAL_DIV:
9794b3e95d5SJeremy L Thompson         case CEED_EVAL_CURL:
9804b3e95d5SJeremy L Thompson           break;  // TODO: Not implemented
9814b3e95d5SJeremy L Thompson                   // LCOV_EXCL_STOP
9824b3e95d5SJeremy L Thompson       }
9834b3e95d5SJeremy L Thompson     }
9844b3e95d5SJeremy L Thompson   } else {
9850183ed61SJeremy L Thompson     code << "\n";
9860183ed61SJeremy L Thompson     code << tab << "// Note: Using full elements\n";
9870183ed61SJeremy L Thompson     code << tab << "{\n";
9880183ed61SJeremy L Thompson     tab.push();
9890183ed61SJeremy L Thompson     code << tab << "// -- Input fields\n";
9904b3e95d5SJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) {
99159fa3f92SJeremy L Thompson       const char *field_name;
99259fa3f92SJeremy L Thompson 
99359fa3f92SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
9940183ed61SJeremy L Thompson       code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
9950183ed61SJeremy L Thompson       code << tab << "CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n";
9964b3e95d5SJeremy L Thompson     }
9970183ed61SJeremy L Thompson     code << tab << "// -- Output fields\n";
9984b3e95d5SJeremy L Thompson     for (CeedInt i = 0; i < num_output_fields; i++) {
99959fa3f92SJeremy L Thompson       const char *field_name;
100059fa3f92SJeremy L Thompson 
100159fa3f92SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
10020183ed61SJeremy L Thompson       code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
10030183ed61SJeremy L Thompson       code << tab << "CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n";
10044b3e95d5SJeremy L Thompson     }
10054b3e95d5SJeremy L Thompson   }
10064b3e95d5SJeremy L Thompson 
10074b3e95d5SJeremy L Thompson   // Input and output buffers
10080183ed61SJeremy L Thompson   code << "\n";
10090183ed61SJeremy L Thompson   code << tab << "// -- QFunction inputs and outputs\n";
10100183ed61SJeremy L Thompson   code << tab << "// ---- Inputs\n";
10110183ed61SJeremy L Thompson   code << tab << "CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
10124b3e95d5SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
101359fa3f92SJeremy L Thompson     const char *field_name;
101459fa3f92SJeremy L Thompson 
101559fa3f92SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
10160183ed61SJeremy L Thompson     code << tab << "// ------ Input field " << i << ": " << field_name << "\n";
10170183ed61SJeremy L Thompson     code << tab << "inputs[" << i << "] = r_s_in_" << i << ";\n";
10184b3e95d5SJeremy L Thompson   }
10190183ed61SJeremy L Thompson   code << tab << "// ---- Outputs\n";
10200183ed61SJeremy L Thompson   code << tab << "CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
10214b3e95d5SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
102259fa3f92SJeremy L Thompson     const char *field_name;
102359fa3f92SJeremy L Thompson 
102459fa3f92SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
10250183ed61SJeremy L Thompson     code << tab << "// ------ Output field " << i << ": " << field_name << "\n";
10260183ed61SJeremy L Thompson     code << tab << "outputs[" << i << "] = r_s_out_" << i << ";\n";
10274b3e95d5SJeremy L Thompson   }
10284b3e95d5SJeremy L Thompson 
10294b3e95d5SJeremy L Thompson   // Apply QFunction
10300183ed61SJeremy L Thompson   code << "\n";
10310183ed61SJeremy L Thompson   code << tab << "// -- Apply QFunction\n";
10320183ed61SJeremy L Thompson   code << tab << "" << qfunction_name << "(ctx, ";
103374398b5aSJeremy L Thompson   if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
10344b3e95d5SJeremy L Thompson     code << "1";
10354b3e95d5SJeremy L Thompson   } else {
10369123fb08SJeremy L Thompson     code << Q_name;
10374b3e95d5SJeremy L Thompson   }
10384b3e95d5SJeremy L Thompson   code << ", inputs, outputs);\n";
10394b3e95d5SJeremy L Thompson 
10403a2968d6SJeremy L Thompson   if (is_at_points) {
10413a2968d6SJeremy L Thompson     // Map back to coefficients
10420183ed61SJeremy L Thompson     code << "\n";
10430183ed61SJeremy L Thompson     code << tab << "// -- Output fields\n";
10443a2968d6SJeremy L Thompson     for (CeedInt i = 0; i < num_output_fields; i++) {
104559fa3f92SJeremy L Thompson       const char *field_name;
10463a2968d6SJeremy L Thompson       std::string var_suffix = "_out_" + std::to_string(i);
10473a2968d6SJeremy L Thompson       std::string P_name     = "P_1d" + var_suffix;
10483a2968d6SJeremy L Thompson 
104959fa3f92SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
10500183ed61SJeremy L Thompson       code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
10513a2968d6SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
10523a2968d6SJeremy L Thompson       // Basis action
10530183ed61SJeremy L Thompson       code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
10543a2968d6SJeremy L Thompson       switch (eval_mode) {
10553a2968d6SJeremy L Thompson         case CEED_EVAL_NONE: {
10563a2968d6SJeremy L Thompson           CeedInt             comp_stride;
10573a2968d6SJeremy L Thompson           CeedElemRestriction elem_rstr;
10583a2968d6SJeremy L Thompson 
1059*745f16d1SZach Atkins           if (is_assemble) break;
10603a2968d6SJeremy L Thompson           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
10613a2968d6SJeremy L Thompson           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
10623a2968d6SJeremy L Thompson           CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
10630183ed61SJeremy L Thompson           code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
10640183ed61SJeremy L Thompson           code << tab << "WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
10653a2968d6SJeremy L Thompson                << ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]"
10663a2968d6SJeremy L Thompson                << ", r_s" << var_suffix << ", d" << var_suffix << ");\n";
10673a2968d6SJeremy L Thompson           break;
10683a2968d6SJeremy L Thompson         }
10693a2968d6SJeremy L Thompson         case CEED_EVAL_INTERP:
10700183ed61SJeremy L Thompson           code << tab << "if (i >= points.num_per_elem[elem]) {\n";
10710183ed61SJeremy L Thompson           tab.push();
10720183ed61SJeremy L Thompson           code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
10730183ed61SJeremy L Thompson           tab.pop();
10740183ed61SJeremy L Thompson           code << tab << "}\n";
10750183ed61SJeremy L Thompson           code << tab << "InterpTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
1076f725b54bSJeremy L Thompson                << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
10773a2968d6SJeremy L Thompson           break;
10783a2968d6SJeremy L Thompson         case CEED_EVAL_GRAD:
10790183ed61SJeremy L Thompson           code << tab << "if (i >= points.num_per_elem[elem]) {\n";
10800183ed61SJeremy L Thompson           tab.push();
10810183ed61SJeremy L Thompson           code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
10820183ed61SJeremy L Thompson           tab.pop();
10830183ed61SJeremy L Thompson           code << tab << "}\n";
10840183ed61SJeremy L Thompson           code << tab << "GradTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
1085f725b54bSJeremy L Thompson                << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
10863a2968d6SJeremy L Thompson           break;
10873a2968d6SJeremy L Thompson           // LCOV_EXCL_START
10883a2968d6SJeremy L Thompson         case CEED_EVAL_WEIGHT:
10893a2968d6SJeremy L Thompson           break;  // Should not occur
10903a2968d6SJeremy L Thompson         case CEED_EVAL_DIV:
10913a2968d6SJeremy L Thompson         case CEED_EVAL_CURL:
10923a2968d6SJeremy L Thompson           break;  // TODO: Not implemented
10933a2968d6SJeremy L Thompson                   // LCOV_EXCL_STOP
10943a2968d6SJeremy L Thompson       }
10953a2968d6SJeremy L Thompson     }
10963a2968d6SJeremy L Thompson   } else if (use_3d_slices) {
10974b3e95d5SJeremy L Thompson     // Copy or apply transpose grad, if needed
10980183ed61SJeremy L Thompson     code << "\n";
10990183ed61SJeremy L Thompson     code << tab << "// -- Output fields\n";
11004b3e95d5SJeremy L Thompson     for (CeedInt i = 0; i < num_output_fields; i++) {
110159fa3f92SJeremy L Thompson       const char *field_name;
11024b3e95d5SJeremy L Thompson       std::string var_suffix = "_out_" + std::to_string(i);
11034b3e95d5SJeremy L Thompson       std::string P_name     = "P_1d" + var_suffix;
11044b3e95d5SJeremy L Thompson 
110559fa3f92SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
11060183ed61SJeremy L Thompson       code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
11074b3e95d5SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
11084b3e95d5SJeremy L Thompson       // Basis action
11090183ed61SJeremy L Thompson       code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
11104b3e95d5SJeremy L Thompson       switch (eval_mode) {
11114b3e95d5SJeremy L Thompson         case CEED_EVAL_NONE:
11120183ed61SJeremy L Thompson           code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
11130183ed61SJeremy L Thompson           tab.push();
11140183ed61SJeremy L Thompson           code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
11150183ed61SJeremy L Thompson           tab.pop();
11160183ed61SJeremy L Thompson           code << tab << "}\n";
11173a2968d6SJeremy L Thompson           break;
11184b3e95d5SJeremy L Thompson         case CEED_EVAL_INTERP:
11190183ed61SJeremy L Thompson           code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
11200183ed61SJeremy L Thompson           tab.push();
11210183ed61SJeremy L Thompson           code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
11220183ed61SJeremy L Thompson           tab.pop();
11230183ed61SJeremy L Thompson           code << tab << "}\n";
11244b3e95d5SJeremy L Thompson           break;
11254b3e95d5SJeremy L Thompson         case CEED_EVAL_GRAD:
11260183ed61SJeremy L Thompson           code << tab << "GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_s" << var_suffix << ", s_G"
1127f815fac9SJeremy L Thompson                << var_suffix << ", r_q" << var_suffix << ");\n";
11284b3e95d5SJeremy L Thompson           break;
11294b3e95d5SJeremy L Thompson           // LCOV_EXCL_START
11304b3e95d5SJeremy L Thompson         case CEED_EVAL_WEIGHT:
11314b3e95d5SJeremy L Thompson           break;  // Should not occur
11324b3e95d5SJeremy L Thompson         case CEED_EVAL_DIV:
11334b3e95d5SJeremy L Thompson         case CEED_EVAL_CURL:
11344b3e95d5SJeremy L Thompson           break;  // TODO: Not implemented
11354b3e95d5SJeremy L Thompson                   // LCOV_EXCL_STOP
11364b3e95d5SJeremy L Thompson       }
11374b3e95d5SJeremy L Thompson     }
11384b3e95d5SJeremy L Thompson   }
11390183ed61SJeremy L Thompson   tab.pop();
11400183ed61SJeremy L Thompson   code << tab << "}\n";
11414b3e95d5SJeremy L Thompson   return CEED_ERROR_SUCCESS;
11424b3e95d5SJeremy L Thompson }
11434b3e95d5SJeremy L Thompson 
11444b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
11459e201c85SYohann // Build single operator kernel
11467d8d0e25Snbeams //------------------------------------------------------------------------------
CeedOperatorBuildKernel_Hip_gen(CeedOperator op,bool * is_good_build)11478d12f40eSJeremy L Thompson extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_build) {
114874398b5aSJeremy L Thompson   bool                   is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
11497d8d0e25Snbeams   Ceed                   ceed;
1150efa41df3SJeremy L Thompson   CeedInt                Q = 0, Q_1d = 0, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
1151b7453713SJeremy L Thompson   CeedQFunctionField    *qf_input_fields, *qf_output_fields;
1152b7453713SJeremy L Thompson   CeedQFunction_Hip_gen *qf_data;
1153b7453713SJeremy L Thompson   CeedQFunction          qf;
1154b7453713SJeremy L Thompson   CeedOperatorField     *op_input_fields, *op_output_fields;
1155b7453713SJeremy L Thompson   CeedOperator_Hip_gen  *data;
11564b3e95d5SJeremy L Thompson   std::ostringstream     code;
11570183ed61SJeremy L Thompson   Tab                    tab;
11584b3e95d5SJeremy L Thompson 
11598d12f40eSJeremy L Thompson   CeedCallBackend(CeedOperatorGetData(op, &data));
11604b3e95d5SJeremy L Thompson   {
11614b3e95d5SJeremy L Thompson     bool is_setup_done;
1162b7453713SJeremy L Thompson 
1163b7453713SJeremy L Thompson     CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
11648d12f40eSJeremy L Thompson     if (is_setup_done) {
11658d12f40eSJeremy L Thompson       *is_good_build = !data->use_fallback;
11668d12f40eSJeremy L Thompson       return CEED_ERROR_SUCCESS;
11678d12f40eSJeremy L Thompson     }
11684b3e95d5SJeremy L Thompson   }
1169b7453713SJeremy L Thompson 
11708d12f40eSJeremy L Thompson   // Check field compatibility
11718d12f40eSJeremy L Thompson   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
11728d12f40eSJeremy L Thompson   {
117374398b5aSJeremy L Thompson     bool has_shared_bases = true;
11748d12f40eSJeremy L Thompson 
11758d12f40eSJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) {
11768d12f40eSJeremy L Thompson       CeedBasis basis;
11778d12f40eSJeremy L Thompson 
11788d12f40eSJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
11798d12f40eSJeremy L Thompson       if (basis != CEED_BASIS_NONE) {
11808d12f40eSJeremy L Thompson         bool        is_tensor = true;
11818d12f40eSJeremy L Thompson         const char *resource;
11828d12f40eSJeremy L Thompson         char       *resource_root;
11838d12f40eSJeremy L Thompson         Ceed        basis_ceed;
11848d12f40eSJeremy L Thompson 
11858d12f40eSJeremy L Thompson         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
1186c9192acaSJeremy L Thompson         is_all_tensor    = is_all_tensor && is_tensor;
1187c9192acaSJeremy L Thompson         is_all_nontensor = is_all_nontensor && !is_tensor;
11888d12f40eSJeremy L Thompson         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
11898d12f40eSJeremy L Thompson         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
11908d12f40eSJeremy L Thompson         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
1191c9192acaSJeremy L Thompson         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
11928d12f40eSJeremy L Thompson         CeedCallBackend(CeedFree(&resource_root));
11938d12f40eSJeremy L Thompson         CeedCallBackend(CeedDestroy(&basis_ceed));
11948d12f40eSJeremy L Thompson       }
11958d12f40eSJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis));
11968d12f40eSJeremy L Thompson     }
11978d12f40eSJeremy L Thompson 
11988d12f40eSJeremy L Thompson     for (CeedInt i = 0; i < num_output_fields; i++) {
11998d12f40eSJeremy L Thompson       CeedBasis basis;
12008d12f40eSJeremy L Thompson 
12018d12f40eSJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
12028d12f40eSJeremy L Thompson       if (basis != CEED_BASIS_NONE) {
12038d12f40eSJeremy L Thompson         bool        is_tensor = true;
12048d12f40eSJeremy L Thompson         const char *resource;
12058d12f40eSJeremy L Thompson         char       *resource_root;
12068d12f40eSJeremy L Thompson         Ceed        basis_ceed;
12078d12f40eSJeremy L Thompson 
12088d12f40eSJeremy L Thompson         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
1209c9192acaSJeremy L Thompson         is_all_tensor    = is_all_tensor && is_tensor;
1210c9192acaSJeremy L Thompson         is_all_nontensor = is_all_nontensor && !is_tensor;
12118d12f40eSJeremy L Thompson 
12128d12f40eSJeremy L Thompson         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
12138d12f40eSJeremy L Thompson         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
12148d12f40eSJeremy L Thompson         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
1215c9192acaSJeremy L Thompson         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
12168d12f40eSJeremy L Thompson         CeedCallBackend(CeedFree(&resource_root));
12178d12f40eSJeremy L Thompson         CeedCallBackend(CeedDestroy(&basis_ceed));
12188d12f40eSJeremy L Thompson       }
12198d12f40eSJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis));
12208d12f40eSJeremy L Thompson     }
12218d12f40eSJeremy L Thompson     // -- Fallback to ref if not all bases are shared
122274398b5aSJeremy L Thompson     if (!has_shared_bases) {
12238d12f40eSJeremy L Thompson       *is_good_build = false;
12248d12f40eSJeremy L Thompson       return CEED_ERROR_SUCCESS;
12258d12f40eSJeremy L Thompson     }
12268d12f40eSJeremy L Thompson   }
1227b7453713SJeremy L Thompson   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1228b7453713SJeremy L Thompson   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
1229b7453713SJeremy L Thompson   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
1230b7453713SJeremy L Thompson   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
12317d8d0e25Snbeams 
12324b3e95d5SJeremy L Thompson   // Get operator data
12333a2968d6SJeremy L Thompson   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
123474398b5aSJeremy L Thompson   {
1235efa41df3SJeremy L Thompson     CeedInt max_P = 0, max_P_1d = 0;
123674398b5aSJeremy L Thompson 
12374b3e95d5SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelData_Hip_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields,
123874398b5aSJeremy L Thompson                                                         qf_output_fields, &max_P, &max_P_1d, &Q, &Q_1d, &max_dim, &is_all_tensor, &use_3d_slices));
123974398b5aSJeremy L Thompson     data->max_P_1d = is_all_tensor ? max_P_1d : max_P;
124074398b5aSJeremy L Thompson   }
12413a2968d6SJeremy L Thompson   if (is_at_points) {
1242e31b7a9fSJeremy L Thompson     CeedInt                  coords_dim = 0;
12433a2968d6SJeremy L Thompson     CeedElemRestriction_Hip *rstr_data;
12443a2968d6SJeremy L Thompson     CeedElemRestriction      rstr_points = NULL;
12454b3e95d5SJeremy L Thompson 
12463a2968d6SJeremy L Thompson     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
12473a2968d6SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
12483a2968d6SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
1249e31b7a9fSJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr_points, &coords_dim));
12503a2968d6SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data));
12513a2968d6SJeremy L Thompson     data->points.indices = (CeedInt *)rstr_data->d_offsets;
12523a2968d6SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
1253e31b7a9fSJeremy L Thompson     if (max_dim == 0) max_dim = coords_dim;
1254e31b7a9fSJeremy L Thompson     if (Q_1d == 0) max_num_points = ceil(pow(max_num_points, 1.0 / max_dim));
12553a2968d6SJeremy L Thompson   }
1256e31b7a9fSJeremy L Thompson   if (max_dim == 0) max_dim = 1;
1257e31b7a9fSJeremy L Thompson   data->dim = max_dim;
12583a2968d6SJeremy L Thompson   if (is_at_points) use_3d_slices = false;
12593a2968d6SJeremy L Thompson   if (Q_1d == 0) {
12603a2968d6SJeremy L Thompson     if (is_at_points) Q_1d = max_num_points;
12613a2968d6SJeremy L Thompson     else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d));
12624b3e95d5SJeremy L Thompson   }
126374398b5aSJeremy L Thompson   if (Q == 0) Q = Q_1d;
126474398b5aSJeremy L Thompson   data->Q    = Q;
12654b3e95d5SJeremy L Thompson   data->Q_1d = Q_1d;
12664b3e95d5SJeremy L Thompson 
12670b454692Sjeremylt   // Check for restriction only identity operator
12684b3e95d5SJeremy L Thompson   {
12694b3e95d5SJeremy L Thompson     bool is_identity_qf;
12704b3e95d5SJeremy L Thompson 
12712b730f8bSJeremy L Thompson     CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf));
12720b454692Sjeremylt     if (is_identity_qf) {
12739e201c85SYohann       CeedEvalMode eval_mode_in, eval_mode_out;
1274b7453713SJeremy L Thompson 
12752b730f8bSJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in));
12762b730f8bSJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out));
12776574a04fSJeremy L Thompson       CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND,
12786574a04fSJeremy L Thompson                 "Backend does not implement restriction only identity operators");
12790b454692Sjeremylt     }
12804b3e95d5SJeremy L Thompson   }
1281b2165e7aSSebastian Grimberg 
1282b2165e7aSSebastian Grimberg   // Load basis source files
1283eaf9ad10SZach Atkins   if (!is_all_nontensor) {
12840183ed61SJeremy L Thompson     code << tab << "// Tensor basis source\n";
12850183ed61SJeremy L Thompson     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
128674398b5aSJeremy L Thompson   }
128774398b5aSJeremy L Thompson   if (!is_all_tensor) {
12880183ed61SJeremy L Thompson     code << tab << "// Non-tensor basis source\n";
12890183ed61SJeremy L Thompson     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n";
12909123fb08SJeremy L Thompson   }
12919123fb08SJeremy L Thompson   if (is_at_points) {
12920183ed61SJeremy L Thompson     code << tab << "// AtPoints basis source\n";
12930183ed61SJeremy L Thompson     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n";
12949123fb08SJeremy L Thompson   }
129574398b5aSJeremy L Thompson   if (!is_all_tensor && !is_all_nontensor) {
12960183ed61SJeremy L Thompson     code << tab << "// Tensor basis source\n";
12970183ed61SJeremy L Thompson     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h>\n\n";
129874398b5aSJeremy L Thompson   }
12990183ed61SJeremy L Thompson   code << tab << "// CodeGen operator source\n";
13000183ed61SJeremy L Thompson   code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
13017d8d0e25Snbeams 
13024b3e95d5SJeremy L Thompson   // Get QFunction name
13034b3e95d5SJeremy L Thompson   std::string qfunction_name(qf_data->qfunction_name);
13044b3e95d5SJeremy L Thompson   std::string operator_name;
13054b3e95d5SJeremy L Thompson 
130609095acaSJeremy L Thompson   operator_name = "CeedKernelHipGenOperator_" + qfunction_name;
13077d8d0e25Snbeams 
13089e201c85SYohann   // Define CEED_Q_VLA
13090183ed61SJeremy L Thompson   code << "\n" << tab << "#undef CEED_Q_VLA\n";
131074398b5aSJeremy L Thompson   if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
13110183ed61SJeremy L Thompson     code << tab << "#define CEED_Q_VLA 1\n\n";
13129e201c85SYohann   } else {
13130183ed61SJeremy L Thompson     code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n";
13149e201c85SYohann   }
13159e201c85SYohann 
13164b3e95d5SJeremy L Thompson   // Add user QFunction source
13174b3e95d5SJeremy L Thompson   {
13189c25dd66SJeremy L Thompson     const char *source_path;
13194b3e95d5SJeremy L Thompson 
13209c25dd66SJeremy L Thompson     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
13219c25dd66SJeremy L Thompson     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
13229c25dd66SJeremy L Thompson 
13230183ed61SJeremy L Thompson     code << tab << "// User QFunction source\n";
13240183ed61SJeremy L Thompson     code << tab << "#include \"" << source_path << "\"\n\n";
13254b3e95d5SJeremy L Thompson   }
13267d8d0e25Snbeams 
13277d8d0e25Snbeams   // Setup
13280183ed61SJeremy L Thompson   code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
13290183ed61SJeremy L Thompson   code << tab << "// Operator Kernel\n";
13300183ed61SJeremy L Thompson   code << tab << "// \n";
13310183ed61SJeremy L Thompson   code << tab << "// d_[in,out]_i:   CeedVector device array\n";
13320183ed61SJeremy L Thompson   code << tab << "// r_[in,out]_e_i: Element vector register\n";
13330183ed61SJeremy L Thompson   code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
13340183ed61SJeremy L Thompson   code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
13350183ed61SJeremy L Thompson   code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
13360183ed61SJeremy L Thompson   code << tab << "// \n";
13370183ed61SJeremy L Thompson   code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
13380183ed61SJeremy L Thompson   code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
13390183ed61SJeremy L Thompson   code << tab << "// -----------------------------------------------------------------------------\n";
13400183ed61SJeremy L Thompson   code << tab << "extern \"C\" __launch_bounds__(BLOCK_SIZE)\n";
13412b730f8bSJeremy L Thompson   code << "__global__ void " << operator_name
13423a2968d6SJeremy L Thompson        << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar* W, Points_Hip points) {\n";
13430183ed61SJeremy L Thompson   tab.push();
13444b3e95d5SJeremy L Thompson 
13454b3e95d5SJeremy L Thompson   // Scratch buffers
13469e201c85SYohann   for (CeedInt i = 0; i < num_input_fields; i++) {
13474b3e95d5SJeremy L Thompson     CeedEvalMode eval_mode;
13484b3e95d5SJeremy L Thompson 
13492b730f8bSJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
13509e201c85SYohann     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
13510183ed61SJeremy L Thompson       code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
13527d8d0e25Snbeams     }
13537d8d0e25Snbeams   }
13549e201c85SYohann   for (CeedInt i = 0; i < num_output_fields; i++) {
13550183ed61SJeremy L Thompson     code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
13567d8d0e25Snbeams   }
13577d8d0e25Snbeams 
13580183ed61SJeremy L Thompson   code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
135974398b5aSJeremy L Thompson   if (!is_all_tensor) {
13600183ed61SJeremy L Thompson     code << tab << "const CeedInt Q = " << Q << ";\n";
136174398b5aSJeremy L Thompson   }
136274398b5aSJeremy L Thompson   if (!is_all_nontensor) {
13630183ed61SJeremy L Thompson     code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
136474398b5aSJeremy L Thompson   }
13653a2968d6SJeremy L Thompson   if (is_at_points) {
13660183ed61SJeremy L Thompson     code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n";
13670183ed61SJeremy L Thompson     code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
13683a2968d6SJeremy L Thompson   }
13697d8d0e25Snbeams 
13704b3e95d5SJeremy L Thompson   // Shared data
13710183ed61SJeremy L Thompson   code << tab << "extern __shared__ CeedScalar slice[];\n";
13720183ed61SJeremy L Thompson   code << tab << "SharedData_Hip data;\n";
13730183ed61SJeremy L Thompson   code << tab << "data.t_id_x = threadIdx.x;\n";
13740183ed61SJeremy L Thompson   code << tab << "data.t_id_y = threadIdx.y;\n";
13750183ed61SJeremy L Thompson   code << tab << "data.t_id_z = threadIdx.z;\n";
13760183ed61SJeremy L Thompson   code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
13770183ed61SJeremy L Thompson   code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
13787d8d0e25Snbeams 
13799ee499e5SJeremy L Thompson   // -- Determine input mat reuse
138045a787f7SJeremy L Thompson   FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
13819ee499e5SJeremy L Thompson 
13829ee499e5SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
138345a787f7SJeremy L Thompson     input_matrix_reuse[i].index = -1;
13849ee499e5SJeremy L Thompson   }
13859ee499e5SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
138674398b5aSJeremy L Thompson     bool         is_tensor = true;
13879ee499e5SJeremy L Thompson     CeedEvalMode eval_mode_i;
13889ee499e5SJeremy L Thompson     CeedBasis    basis_i;
13899ee499e5SJeremy L Thompson 
13909ee499e5SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
13919ee499e5SJeremy L Thompson     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
13929ee499e5SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
139374398b5aSJeremy L Thompson     CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
139445a787f7SJeremy L Thompson     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
13959ee499e5SJeremy L Thompson       CeedEvalMode eval_mode_j;
13969ee499e5SJeremy L Thompson       CeedBasis    basis_j;
13979ee499e5SJeremy L Thompson 
13989ee499e5SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
13999ee499e5SJeremy L Thompson       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
14009ee499e5SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
14019ee499e5SJeremy L Thompson       if (basis_i == basis_j) {
14029ee499e5SJeremy L Thompson         if (is_tensor) {
140345a787f7SJeremy L Thompson           input_matrix_reuse[i].index     = j;
140445a787f7SJeremy L Thompson           input_matrix_reuse[i].is_input  = true;
140545a787f7SJeremy L Thompson           input_matrix_reuse[i].eval_mode = eval_mode_j;
14069ee499e5SJeremy L Thompson         } else {
14079ee499e5SJeremy L Thompson           // For non-tensor can only re-use with the same eval mode
14089ee499e5SJeremy L Thompson           if (eval_mode_i == eval_mode_j) {
140945a787f7SJeremy L Thompson             input_matrix_reuse[i].index     = j;
141045a787f7SJeremy L Thompson             input_matrix_reuse[i].is_input  = true;
141145a787f7SJeremy L Thompson             input_matrix_reuse[i].eval_mode = eval_mode_j;
14129ee499e5SJeremy L Thompson           }
14139ee499e5SJeremy L Thompson         }
14149ee499e5SJeremy L Thompson       }
14159ee499e5SJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis_j));
14169ee499e5SJeremy L Thompson     }
14179ee499e5SJeremy L Thompson     CeedCallBackend(CeedBasisDestroy(&basis_i));
14189ee499e5SJeremy L Thompson   }
14199ee499e5SJeremy L Thompson 
14209ee499e5SJeremy L Thompson   // -- Determine output mat reuse
142145a787f7SJeremy L Thompson   FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
14229ee499e5SJeremy L Thompson 
14239ee499e5SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
142445a787f7SJeremy L Thompson     output_matrix_reuse[i].index = -1;
14259ee499e5SJeremy L Thompson   }
14269ee499e5SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
142774398b5aSJeremy L Thompson     bool         is_tensor = true;
14289ee499e5SJeremy L Thompson     CeedEvalMode eval_mode_i;
14299ee499e5SJeremy L Thompson     CeedBasis    basis_i;
14309ee499e5SJeremy L Thompson 
14319ee499e5SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
14329ee499e5SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
143345a787f7SJeremy L Thompson     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
14349ee499e5SJeremy L Thompson       CeedEvalMode eval_mode_j;
14359ee499e5SJeremy L Thompson       CeedBasis    basis_j;
14369ee499e5SJeremy L Thompson 
14379ee499e5SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
14389ee499e5SJeremy L Thompson       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
14399ee499e5SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
14409ee499e5SJeremy L Thompson       if (basis_i == basis_j) {
14419ee499e5SJeremy L Thompson         if (is_tensor) {
144245a787f7SJeremy L Thompson           output_matrix_reuse[i].index     = j;
144345a787f7SJeremy L Thompson           output_matrix_reuse[i].is_input  = true;
144445a787f7SJeremy L Thompson           output_matrix_reuse[i].eval_mode = eval_mode_j;
14459ee499e5SJeremy L Thompson         } else {
14469ee499e5SJeremy L Thompson           // For non-tensor can only re-use with the same eval mode
14479ee499e5SJeremy L Thompson           if (eval_mode_i == eval_mode_j) {
144845a787f7SJeremy L Thompson             output_matrix_reuse[i].index     = j;
144945a787f7SJeremy L Thompson             output_matrix_reuse[i].is_input  = true;
145045a787f7SJeremy L Thompson             output_matrix_reuse[i].eval_mode = eval_mode_j;
14519ee499e5SJeremy L Thompson           }
14529ee499e5SJeremy L Thompson         }
14539ee499e5SJeremy L Thompson       }
14549ee499e5SJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis_j));
14559ee499e5SJeremy L Thompson     }
145645a787f7SJeremy L Thompson     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
14579ee499e5SJeremy L Thompson       CeedEvalMode eval_mode_j;
14589ee499e5SJeremy L Thompson       CeedBasis    basis_j;
14599ee499e5SJeremy L Thompson 
14609ee499e5SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
14619ee499e5SJeremy L Thompson       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
14629ee499e5SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
146374398b5aSJeremy L Thompson       CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
14649ee499e5SJeremy L Thompson       if (basis_i == basis_j) {
14659ee499e5SJeremy L Thompson         if (is_tensor) {
146645a787f7SJeremy L Thompson           output_matrix_reuse[i].index     = j;
146745a787f7SJeremy L Thompson           output_matrix_reuse[i].is_input  = false;
146845a787f7SJeremy L Thompson           output_matrix_reuse[i].eval_mode = eval_mode_j;
14699ee499e5SJeremy L Thompson         } else {
14709ee499e5SJeremy L Thompson           // For non-tensor can only re-use with the same eval mode
14719ee499e5SJeremy L Thompson           if (eval_mode_i == eval_mode_j) {
147245a787f7SJeremy L Thompson             output_matrix_reuse[i].index     = j;
147345a787f7SJeremy L Thompson             output_matrix_reuse[i].is_input  = false;
147445a787f7SJeremy L Thompson             output_matrix_reuse[i].eval_mode = eval_mode_j;
14759ee499e5SJeremy L Thompson           }
14769ee499e5SJeremy L Thompson         }
14779ee499e5SJeremy L Thompson       }
14789ee499e5SJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis_j));
14799ee499e5SJeremy L Thompson     }
14809ee499e5SJeremy L Thompson     CeedCallBackend(CeedBasisDestroy(&basis_i));
14819ee499e5SJeremy L Thompson   }
14829ee499e5SJeremy L Thompson 
14837d8d0e25Snbeams   // Initialize constants, and matrices B and G
14840183ed61SJeremy L Thompson   code << "\n" << tab << "// Input field constants and basis data\n";
14859e201c85SYohann   for (CeedInt i = 0; i < num_input_fields; i++) {
14860183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
1487ca1da9b9SJeremy L Thompson                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false));
14887d8d0e25Snbeams   }
14890183ed61SJeremy L Thompson   code << "\n" << tab << "// Output field constants and basis data\n";
14909e201c85SYohann   for (CeedInt i = 0; i < num_output_fields; i++) {
14910183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
1492ca1da9b9SJeremy L Thompson                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false));
14934b3e95d5SJeremy L Thompson   }
14947d8d0e25Snbeams 
14954b3e95d5SJeremy L Thompson   // Loop over all elements
14960183ed61SJeremy L Thompson   code << "\n" << tab << "// Element loop\n";
14970183ed61SJeremy L Thompson   code << tab << "__syncthreads();\n";
14980183ed61SJeremy L Thompson   code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
14990183ed61SJeremy L Thompson   tab.push();
15004b3e95d5SJeremy L Thompson 
1501e93651e5SJeremy L Thompson   // -- Compute minimum buffer space needed
15023a2968d6SJeremy L Thompson   CeedInt max_rstr_buffer_size = 1;
1503e93651e5SJeremy L Thompson 
1504e93651e5SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
15056de40545SJeremy L Thompson     CeedEvalMode eval_mode;
15066de40545SJeremy L Thompson 
15076de40545SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
15086de40545SJeremy L Thompson     if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
1509a61b1c91SJeremy L Thompson       CeedInt             num_comp;
1510e93651e5SJeremy L Thompson       CeedElemRestriction elem_rstr;
1511e93651e5SJeremy L Thompson 
1512e93651e5SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
1513e93651e5SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1514a61b1c91SJeremy L Thompson       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1515681d0ea7SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1516e93651e5SJeremy L Thompson     }
15176de40545SJeremy L Thompson   }
1518e93651e5SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
15196de40545SJeremy L Thompson     CeedEvalMode eval_mode;
15206de40545SJeremy L Thompson 
15216de40545SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
15226de40545SJeremy L Thompson     if (eval_mode != CEED_EVAL_NONE) {
1523a61b1c91SJeremy L Thompson       CeedInt             num_comp;
1524e93651e5SJeremy L Thompson       CeedElemRestriction elem_rstr;
1525e93651e5SJeremy L Thompson 
1526e93651e5SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
1527e93651e5SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1528a61b1c91SJeremy L Thompson       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1529681d0ea7SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1530e93651e5SJeremy L Thompson     }
15316de40545SJeremy L Thompson   }
15320183ed61SJeremy L Thompson   code << tab << "// Scratch restriction buffer space\n";
15330183ed61SJeremy L Thompson   code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
1534e93651e5SJeremy L Thompson 
1535e93651e5SJeremy L Thompson   // -- Determine best input field processing order
1536e93651e5SJeremy L Thompson   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
1537e93651e5SJeremy L Thompson 
1538e93651e5SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
1539e93651e5SJeremy L Thompson     field_rstr_in_buffer[i] = -1;
1540e93651e5SJeremy L Thompson     input_field_order[i]    = -1;
1541e93651e5SJeremy L Thompson   }
1542e93651e5SJeremy L Thompson   {
1543e93651e5SJeremy L Thompson     bool    is_ordered[CEED_FIELD_MAX];
1544e93651e5SJeremy L Thompson     CeedInt curr_index = 0;
1545e93651e5SJeremy L Thompson 
1546e93651e5SJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
1547e93651e5SJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) {
1548e93651e5SJeremy L Thompson       CeedVector          vec_i;
1549e93651e5SJeremy L Thompson       CeedElemRestriction rstr_i;
1550e93651e5SJeremy L Thompson 
1551e93651e5SJeremy L Thompson       if (is_ordered[i]) continue;
1552e93651e5SJeremy L Thompson       field_rstr_in_buffer[i]       = i;
1553e93651e5SJeremy L Thompson       is_ordered[i]                 = true;
1554e93651e5SJeremy L Thompson       input_field_order[curr_index] = i;
1555e93651e5SJeremy L Thompson       curr_index++;
1556034f99fdSJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
1557e93651e5SJeremy L Thompson       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
1558e93651e5SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
1559e93651e5SJeremy L Thompson       for (CeedInt j = i + 1; j < num_input_fields; j++) {
1560e93651e5SJeremy L Thompson         CeedVector          vec_j;
1561e93651e5SJeremy L Thompson         CeedElemRestriction rstr_j;
1562e93651e5SJeremy L Thompson 
1563e93651e5SJeremy L Thompson         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
1564e93651e5SJeremy L Thompson         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
1565e93651e5SJeremy L Thompson         if (rstr_i == rstr_j && vec_i == vec_j) {
1566e93651e5SJeremy L Thompson           field_rstr_in_buffer[j]       = i;
1567e93651e5SJeremy L Thompson           is_ordered[j]                 = true;
1568e93651e5SJeremy L Thompson           input_field_order[curr_index] = j;
1569e93651e5SJeremy L Thompson           curr_index++;
1570e93651e5SJeremy L Thompson         }
15713a2968d6SJeremy L Thompson         CeedCallBackend(CeedVectorDestroy(&vec_j));
15723a2968d6SJeremy L Thompson         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
1573e93651e5SJeremy L Thompson       }
15743a2968d6SJeremy L Thompson       CeedCallBackend(CeedVectorDestroy(&vec_i));
15753a2968d6SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
1576e93651e5SJeremy L Thompson     }
1577e93651e5SJeremy L Thompson   }
1578e93651e5SJeremy L Thompson 
15794b3e95d5SJeremy L Thompson   // -- Input restriction and basis
15800183ed61SJeremy L Thompson   code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
15819e201c85SYohann   for (CeedInt i = 0; i < num_input_fields; i++) {
158259fa3f92SJeremy L Thompson     const char   *field_name;
158359fa3f92SJeremy L Thompson     const CeedInt f = input_field_order[i];
1584e93651e5SJeremy L Thompson 
158559fa3f92SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
15860183ed61SJeremy L Thompson     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
15877d8d0e25Snbeams 
15884b3e95d5SJeremy L Thompson     // ---- Restriction
15890183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
15900183ed61SJeremy L Thompson                                                                max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1591b7453713SJeremy L Thompson 
15924b3e95d5SJeremy L Thompson     // ---- Basis action
15930183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
15940183ed61SJeremy L Thompson                                                          is_all_tensor, is_at_points, use_3d_slices));
15957d8d0e25Snbeams   }
15967d8d0e25Snbeams 
15974b3e95d5SJeremy L Thompson   // -- Q function
15980183ed61SJeremy L Thompson   CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
15990183ed61SJeremy L Thompson                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
1600*745f16d1SZach Atkins                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices, false));
16017d8d0e25Snbeams 
16024b3e95d5SJeremy L Thompson   // -- Output basis and restriction
16030183ed61SJeremy L Thompson   code << "\n" << tab << "// -- Output field basis action and restrictions\n";
16049e201c85SYohann   for (CeedInt i = 0; i < num_output_fields; i++) {
160559fa3f92SJeremy L Thompson     const char *field_name;
160659fa3f92SJeremy L Thompson 
160759fa3f92SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
16080183ed61SJeremy L Thompson     code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
1609b7453713SJeremy L Thompson 
16104b3e95d5SJeremy L Thompson     // ---- Basis action
16110183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
16120183ed61SJeremy L Thompson                                                          is_all_tensor, is_at_points, use_3d_slices));
16137d8d0e25Snbeams 
16144b3e95d5SJeremy L Thompson     // ---- Restriction
16150183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, i, NULL, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d,
16160183ed61SJeremy L Thompson                                                                false, is_all_tensor, is_at_points, use_3d_slices));
16177d8d0e25Snbeams   }
16187d8d0e25Snbeams 
16194b3e95d5SJeremy L Thompson   // Close loop and function
16200183ed61SJeremy L Thompson   tab.pop();
16210183ed61SJeremy L Thompson   code << tab << "}\n";
16220183ed61SJeremy L Thompson   tab.pop();
16230183ed61SJeremy L Thompson   code << tab << "}\n";
16240183ed61SJeremy L Thompson   code << tab << "// -----------------------------------------------------------------------------\n\n";
16257d8d0e25Snbeams 
1626539ec17dSJeremy L Thompson   CeedInt block_sizes[3] = {0, 0, 0};
16279e201c85SYohann   CeedInt num_elem;
1628b7453713SJeremy L Thompson 
16293a2968d6SJeremy L Thompson   // Compile
16302b730f8bSJeremy L Thompson   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
163174398b5aSJeremy L Thompson   CeedCallBackend(BlockGridCalculate_Hip_gen(is_all_tensor ? max_dim : 1, num_elem, data->max_P_1d, is_all_tensor ? Q_1d : Q, block_sizes));
16328d12f40eSJeremy L Thompson   {
16338d12f40eSJeremy L Thompson     bool is_compile_good = false;
16348d12f40eSJeremy L Thompson 
1635a61b1c91SJeremy L Thompson     data->thread_1d = block_sizes[0];
16366b92dc4bSJeremy L Thompson     CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "OP_T_1D", block_sizes[0], "BLOCK_SIZE",
16372b730f8bSJeremy L Thompson                                        block_sizes[0] * block_sizes[1] * block_sizes[2]));
16388d12f40eSJeremy L Thompson     if (is_compile_good) {
16398d12f40eSJeremy L Thompson       *is_good_build = true;
1640eb7e6cafSJeremy L Thompson       CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, operator_name.c_str(), &data->op));
16418d12f40eSJeremy L Thompson     } else {
16428d12f40eSJeremy L Thompson       *is_good_build     = false;
16438d12f40eSJeremy L Thompson       data->use_fallback = true;
16448d12f40eSJeremy L Thompson     }
16458d12f40eSJeremy L Thompson   }
16462b730f8bSJeremy L Thompson   CeedCallBackend(CeedOperatorSetSetupDone(op));
16479bc66399SJeremy L Thompson   CeedCallBackend(CeedDestroy(&ceed));
1648c11e12f4SJeremy L Thompson   CeedCallBackend(CeedQFunctionDestroy(&qf));
1649e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
16507d8d0e25Snbeams }
16512a86cc9dSSebastian Grimberg 
16527d8d0e25Snbeams //------------------------------------------------------------------------------
16530183ed61SJeremy L Thompson // Build AtPoints assembly operator kernel
16540183ed61SJeremy L Thompson //------------------------------------------------------------------------------
CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op,bool is_full,bool * is_good_build)16550183ed61SJeremy L Thompson static int CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op, bool is_full, bool *is_good_build) {
16560183ed61SJeremy L Thompson   bool                   is_all_tensor = true, is_at_points = false, use_3d_slices = false;
16570183ed61SJeremy L Thompson   Ceed                   ceed;
16580183ed61SJeremy L Thompson   CeedInt                Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
16590183ed61SJeremy L Thompson   CeedQFunctionField    *qf_input_fields, *qf_output_fields;
16600183ed61SJeremy L Thompson   CeedQFunction_Hip_gen *qf_data;
16610183ed61SJeremy L Thompson   CeedQFunction          qf;
16620183ed61SJeremy L Thompson   CeedOperatorField     *op_input_fields, *op_output_fields;
16630183ed61SJeremy L Thompson   CeedOperator_Hip_gen  *data;
16640183ed61SJeremy L Thompson   std::ostringstream     code;
16650183ed61SJeremy L Thompson   Tab                    tab;
16660183ed61SJeremy L Thompson 
16670183ed61SJeremy L Thompson   // Check compatibility
16680183ed61SJeremy L Thompson   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
16690183ed61SJeremy L Thompson   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
16700183ed61SJeremy L Thompson   CeedCheck(is_at_points, ceed, CEED_ERROR_BACKEND, "Only AtPoints operator assembly supported");
16710183ed61SJeremy L Thompson 
16720183ed61SJeremy L Thompson   // Retrieve operator data
16730183ed61SJeremy L Thompson   CeedCallBackend(CeedOperatorGetData(op, &data));
16740183ed61SJeremy L Thompson   Q       = data->Q;
16750183ed61SJeremy L Thompson   Q_1d    = data->Q_1d;
16760183ed61SJeremy L Thompson   max_dim = data->dim;
16770183ed61SJeremy L Thompson   {
16780183ed61SJeremy L Thompson     CeedElemRestriction rstr_points = NULL;
16790183ed61SJeremy L Thompson 
16800183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
16810183ed61SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
16820183ed61SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
16830183ed61SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
16840183ed61SJeremy L Thompson   }
16850183ed61SJeremy L Thompson   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
16860183ed61SJeremy L Thompson   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
16870183ed61SJeremy L Thompson   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
16880183ed61SJeremy L Thompson   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
16890183ed61SJeremy L Thompson 
16900183ed61SJeremy L Thompson   // Load basis source files
16910183ed61SJeremy L Thompson   code << tab << "// Tensor basis source\n";
16920183ed61SJeremy L Thompson   code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
16930183ed61SJeremy L Thompson   code << tab << "// AtPoints basis source\n";
16940183ed61SJeremy L Thompson   code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n";
16950183ed61SJeremy L Thompson   code << tab << "// CodeGen operator source\n";
16960183ed61SJeremy L Thompson   code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
16970183ed61SJeremy L Thompson 
16980183ed61SJeremy L Thompson   // Get QFunction name
16990183ed61SJeremy L Thompson   std::string qfunction_name(qf_data->qfunction_name);
17000183ed61SJeremy L Thompson   std::string operator_name;
17010183ed61SJeremy L Thompson 
17020183ed61SJeremy L Thompson   if (is_full) {
17030183ed61SJeremy L Thompson     operator_name = "CeedKernelHipGenOperatorFullAssembly_" + qfunction_name;
17040183ed61SJeremy L Thompson   } else {
17050183ed61SJeremy L Thompson     operator_name = "CeedKernelHipGenOperatorDiagonalAssembly_" + qfunction_name;
17060183ed61SJeremy L Thompson   }
17070183ed61SJeremy L Thompson 
17080183ed61SJeremy L Thompson   // Define CEED_Q_VLA
17090183ed61SJeremy L Thompson   code << "\n" << tab << "#undef CEED_Q_VLA\n";
17100183ed61SJeremy L Thompson   code << tab << "#define CEED_Q_VLA 1\n\n";
17110183ed61SJeremy L Thompson 
17120183ed61SJeremy L Thompson   // Add user QFunction source
17130183ed61SJeremy L Thompson   {
17140183ed61SJeremy L Thompson     const char *source_path;
17150183ed61SJeremy L Thompson 
17160183ed61SJeremy L Thompson     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
17170183ed61SJeremy L Thompson     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
17180183ed61SJeremy L Thompson 
17190183ed61SJeremy L Thompson     code << tab << "// User QFunction source\n";
17200183ed61SJeremy L Thompson     code << tab << "#include \"" << source_path << "\"\n\n";
17210183ed61SJeremy L Thompson   }
17220183ed61SJeremy L Thompson 
17230183ed61SJeremy L Thompson   // Setup
17240183ed61SJeremy L Thompson   code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
17250183ed61SJeremy L Thompson   code << tab << "// Operator Assembly Kernel\n";
17260183ed61SJeremy L Thompson   code << tab << "// \n";
17270183ed61SJeremy L Thompson   code << tab << "// d_[in,out]_i:   CeedVector device array\n";
17280183ed61SJeremy L Thompson   code << tab << "// r_[in,out]_e_i: Element vector register\n";
17290183ed61SJeremy L Thompson   code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
17300183ed61SJeremy L Thompson   code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
17310183ed61SJeremy L Thompson   code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
17320183ed61SJeremy L Thompson   code << tab << "// \n";
17330183ed61SJeremy L Thompson   code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
17340183ed61SJeremy L Thompson   code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
17350183ed61SJeremy L Thompson   code << tab << "// -----------------------------------------------------------------------------\n";
17360183ed61SJeremy L Thompson   code << tab << "extern \"C\" __global__ void " << operator_name
17370183ed61SJeremy L Thompson        << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar *W, Points_Hip "
17380183ed61SJeremy L Thompson           "points, CeedScalar *__restrict__ values_array) {\n";
17390183ed61SJeremy L Thompson   tab.push();
17400183ed61SJeremy L Thompson 
17410183ed61SJeremy L Thompson   // Scratch buffers
17420183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
17430183ed61SJeremy L Thompson     CeedEvalMode eval_mode;
17440183ed61SJeremy L Thompson 
17450183ed61SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
17460183ed61SJeremy L Thompson     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
17470183ed61SJeremy L Thompson       code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
17480183ed61SJeremy L Thompson     }
17490183ed61SJeremy L Thompson   }
17500183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
17510183ed61SJeremy L Thompson     code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
17520183ed61SJeremy L Thompson   }
17530183ed61SJeremy L Thompson 
17540183ed61SJeremy L Thompson   code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
17550183ed61SJeremy L Thompson   code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
17560183ed61SJeremy L Thompson   code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n";
17570183ed61SJeremy L Thompson   code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
17580183ed61SJeremy L Thompson 
17590183ed61SJeremy L Thompson   // Shared data
17600183ed61SJeremy L Thompson   code << tab << "extern __shared__ CeedScalar slice[];\n";
17610183ed61SJeremy L Thompson   code << tab << "SharedData_Hip data;\n";
17620183ed61SJeremy L Thompson   code << tab << "data.t_id_x = threadIdx.x;\n";
17630183ed61SJeremy L Thompson   code << tab << "data.t_id_y = threadIdx.y;\n";
17640183ed61SJeremy L Thompson   code << tab << "data.t_id_z = threadIdx.z;\n";
17650183ed61SJeremy L Thompson   code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
17660183ed61SJeremy L Thompson   code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
17670183ed61SJeremy L Thompson 
17680183ed61SJeremy L Thompson   // -- Determine input mat reuse
17690183ed61SJeremy L Thompson   FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
17700183ed61SJeremy L Thompson 
17710183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
17720183ed61SJeremy L Thompson     input_matrix_reuse[i].index = -1;
17730183ed61SJeremy L Thompson   }
17740183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
17750183ed61SJeremy L Thompson     CeedEvalMode eval_mode_i;
17760183ed61SJeremy L Thompson     CeedBasis    basis_i;
17770183ed61SJeremy L Thompson 
17780183ed61SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
17790183ed61SJeremy L Thompson     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
17800183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
17810183ed61SJeremy L Thompson     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
17820183ed61SJeremy L Thompson       CeedEvalMode eval_mode_j;
17830183ed61SJeremy L Thompson       CeedBasis    basis_j;
17840183ed61SJeremy L Thompson 
17850183ed61SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
17860183ed61SJeremy L Thompson       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
17870183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
17880183ed61SJeremy L Thompson       if (basis_i == basis_j) {
17890183ed61SJeremy L Thompson         input_matrix_reuse[i].index     = j;
17900183ed61SJeremy L Thompson         input_matrix_reuse[i].is_input  = true;
17910183ed61SJeremy L Thompson         input_matrix_reuse[i].eval_mode = eval_mode_j;
17920183ed61SJeremy L Thompson       }
17930183ed61SJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis_j));
17940183ed61SJeremy L Thompson     }
17950183ed61SJeremy L Thompson     CeedCallBackend(CeedBasisDestroy(&basis_i));
17960183ed61SJeremy L Thompson   }
17970183ed61SJeremy L Thompson 
17980183ed61SJeremy L Thompson   // -- Determine output mat reuse
17990183ed61SJeremy L Thompson   FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
18000183ed61SJeremy L Thompson 
18010183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
18020183ed61SJeremy L Thompson     output_matrix_reuse[i].index = -1;
18030183ed61SJeremy L Thompson   }
18040183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
18050183ed61SJeremy L Thompson     CeedEvalMode eval_mode_i;
18060183ed61SJeremy L Thompson     CeedBasis    basis_i;
18070183ed61SJeremy L Thompson 
18080183ed61SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
18090183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
18100183ed61SJeremy L Thompson     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
18110183ed61SJeremy L Thompson       CeedEvalMode eval_mode_j;
18120183ed61SJeremy L Thompson       CeedBasis    basis_j;
18130183ed61SJeremy L Thompson 
18140183ed61SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
18150183ed61SJeremy L Thompson       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
18160183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
18170183ed61SJeremy L Thompson       if (basis_i == basis_j) {
18180183ed61SJeremy L Thompson         output_matrix_reuse[i].index     = j;
18190183ed61SJeremy L Thompson         output_matrix_reuse[i].is_input  = true;
18200183ed61SJeremy L Thompson         output_matrix_reuse[i].eval_mode = eval_mode_j;
18210183ed61SJeremy L Thompson       }
18220183ed61SJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis_j));
18230183ed61SJeremy L Thompson     }
18240183ed61SJeremy L Thompson     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
18250183ed61SJeremy L Thompson       CeedEvalMode eval_mode_j;
18260183ed61SJeremy L Thompson       CeedBasis    basis_j;
18270183ed61SJeremy L Thompson 
18280183ed61SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
18290183ed61SJeremy L Thompson       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
18300183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
18310183ed61SJeremy L Thompson       if (basis_i == basis_j) {
18320183ed61SJeremy L Thompson         output_matrix_reuse[i].index     = j;
18330183ed61SJeremy L Thompson         output_matrix_reuse[i].is_input  = false;
18340183ed61SJeremy L Thompson         output_matrix_reuse[i].eval_mode = eval_mode_j;
18350183ed61SJeremy L Thompson       }
18360183ed61SJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis_j));
18370183ed61SJeremy L Thompson     }
18380183ed61SJeremy L Thompson     CeedCallBackend(CeedBasisDestroy(&basis_i));
18390183ed61SJeremy L Thompson   }
18400183ed61SJeremy L Thompson 
18410183ed61SJeremy L Thompson   // Initialize constants, and matrices B and G
18420183ed61SJeremy L Thompson   code << "\n" << tab << "// Input field constants and basis data\n";
18430183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
18440183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
1845ca1da9b9SJeremy L Thompson                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, false));
18460183ed61SJeremy L Thompson   }
18470183ed61SJeremy L Thompson   code << "\n" << tab << "// Output field constants and basis data\n";
18480183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
18490183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
1850ca1da9b9SJeremy L Thompson                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, false));
18510183ed61SJeremy L Thompson   }
18520183ed61SJeremy L Thompson 
18530183ed61SJeremy L Thompson   // Loop over all elements
18540183ed61SJeremy L Thompson   code << "\n" << tab << "// Element loop\n";
18550183ed61SJeremy L Thompson   code << tab << "__syncthreads();\n";
18560183ed61SJeremy L Thompson   code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
18570183ed61SJeremy L Thompson   tab.push();
18580183ed61SJeremy L Thompson 
18590183ed61SJeremy L Thompson   // -- Compute minimum buffer space needed
18600183ed61SJeremy L Thompson   CeedInt max_rstr_buffer_size = 1;
18610183ed61SJeremy L Thompson 
18620183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
18630183ed61SJeremy L Thompson     CeedEvalMode eval_mode;
18640183ed61SJeremy L Thompson 
18650183ed61SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
18660183ed61SJeremy L Thompson     if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
18670183ed61SJeremy L Thompson       CeedInt             num_comp;
18680183ed61SJeremy L Thompson       CeedElemRestriction elem_rstr;
18690183ed61SJeremy L Thompson 
18700183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
18710183ed61SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
18720183ed61SJeremy L Thompson       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
18730183ed61SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
18740183ed61SJeremy L Thompson     }
18750183ed61SJeremy L Thompson   }
18760183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
18770183ed61SJeremy L Thompson     CeedEvalMode eval_mode;
18780183ed61SJeremy L Thompson 
18790183ed61SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
18800183ed61SJeremy L Thompson     if (eval_mode != CEED_EVAL_NONE) {
18810183ed61SJeremy L Thompson       CeedInt             num_comp;
18820183ed61SJeremy L Thompson       CeedElemRestriction elem_rstr;
18830183ed61SJeremy L Thompson 
18840183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
18850183ed61SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
18860183ed61SJeremy L Thompson       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
18870183ed61SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
18880183ed61SJeremy L Thompson     }
18890183ed61SJeremy L Thompson   }
18900183ed61SJeremy L Thompson   code << tab << "// Scratch restriction buffer space\n";
18910183ed61SJeremy L Thompson   code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
18920183ed61SJeremy L Thompson 
18930183ed61SJeremy L Thompson   // -- Determine best input field processing order
18940183ed61SJeremy L Thompson   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
18950183ed61SJeremy L Thompson 
18960183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
18970183ed61SJeremy L Thompson     field_rstr_in_buffer[i] = -1;
18980183ed61SJeremy L Thompson     input_field_order[i]    = -1;
18990183ed61SJeremy L Thompson   }
19000183ed61SJeremy L Thompson   {
19010183ed61SJeremy L Thompson     bool    is_ordered[CEED_FIELD_MAX];
19020183ed61SJeremy L Thompson     CeedInt curr_index = 0;
19030183ed61SJeremy L Thompson 
19040183ed61SJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
19050183ed61SJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) {
19060183ed61SJeremy L Thompson       CeedVector          vec_i;
19070183ed61SJeremy L Thompson       CeedElemRestriction rstr_i;
19080183ed61SJeremy L Thompson 
19090183ed61SJeremy L Thompson       if (is_ordered[i]) continue;
19100183ed61SJeremy L Thompson       field_rstr_in_buffer[i]       = i;
19110183ed61SJeremy L Thompson       is_ordered[i]                 = true;
19120183ed61SJeremy L Thompson       input_field_order[curr_index] = i;
19130183ed61SJeremy L Thompson       curr_index++;
19140183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
19150183ed61SJeremy L Thompson       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
19160183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
19170183ed61SJeremy L Thompson       for (CeedInt j = i + 1; j < num_input_fields; j++) {
19180183ed61SJeremy L Thompson         CeedVector          vec_j;
19190183ed61SJeremy L Thompson         CeedElemRestriction rstr_j;
19200183ed61SJeremy L Thompson 
19210183ed61SJeremy L Thompson         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
19220183ed61SJeremy L Thompson         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
19230183ed61SJeremy L Thompson         if (rstr_i == rstr_j && vec_i == vec_j) {
19240183ed61SJeremy L Thompson           field_rstr_in_buffer[j]       = i;
19250183ed61SJeremy L Thompson           is_ordered[j]                 = true;
19260183ed61SJeremy L Thompson           input_field_order[curr_index] = j;
19270183ed61SJeremy L Thompson           curr_index++;
19280183ed61SJeremy L Thompson         }
19290183ed61SJeremy L Thompson         CeedCallBackend(CeedVectorDestroy(&vec_j));
19300183ed61SJeremy L Thompson         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
19310183ed61SJeremy L Thompson       }
19320183ed61SJeremy L Thompson       CeedCallBackend(CeedVectorDestroy(&vec_i));
19330183ed61SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
19340183ed61SJeremy L Thompson     }
19350183ed61SJeremy L Thompson   }
19360183ed61SJeremy L Thompson 
19370183ed61SJeremy L Thompson   // -- Input restriction and basis
19380183ed61SJeremy L Thompson   code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
19390183ed61SJeremy L Thompson   CeedInt active_field_index = -1;
19400183ed61SJeremy L Thompson 
19410183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
19420183ed61SJeremy L Thompson     bool          is_active = false;
19430183ed61SJeremy L Thompson     const char   *field_name;
19440183ed61SJeremy L Thompson     const CeedInt f = input_field_order[i];
19450183ed61SJeremy L Thompson 
19460183ed61SJeremy L Thompson     {
19470183ed61SJeremy L Thompson       CeedVector vec;
19480183ed61SJeremy L Thompson 
19490183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
19500183ed61SJeremy L Thompson       is_active = vec == CEED_VECTOR_ACTIVE;
19510183ed61SJeremy L Thompson       CeedCallBackend(CeedVectorDestroy(&vec));
19520183ed61SJeremy L Thompson     }
19530183ed61SJeremy L Thompson 
19540183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
19550183ed61SJeremy L Thompson     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
19560183ed61SJeremy L Thompson 
19570183ed61SJeremy L Thompson     if (is_active) {
19580183ed61SJeremy L Thompson       std::string var_suffix = "_in_" + std::to_string(f);
19590183ed61SJeremy L Thompson 
19600183ed61SJeremy L Thompson       code << tab << "// Active field - no restriction or basis action here\n";
19610183ed61SJeremy L Thompson       if (active_field_index == -1) {
19620183ed61SJeremy L Thompson         active_field_index = f;
19630183ed61SJeremy L Thompson         code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? "P_1d" + var_suffix : "1")
19640183ed61SJeremy L Thompson              << "] = {0.0};\n";
19650183ed61SJeremy L Thompson       } else {
19660183ed61SJeremy L Thompson         code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_in_" << active_field_index << ";\n";
19670183ed61SJeremy L Thompson       }
19680183ed61SJeremy L Thompson     } else {
19690183ed61SJeremy L Thompson       // ---- Restriction
19700183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
19710183ed61SJeremy L Thompson                                                                  max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
19720183ed61SJeremy L Thompson 
19730183ed61SJeremy L Thompson       // ---- Basis action
19740183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
19750183ed61SJeremy L Thompson                                                            is_all_tensor, is_at_points, use_3d_slices));
19760183ed61SJeremy L Thompson     }
19770183ed61SJeremy L Thompson   }
19780183ed61SJeremy L Thompson 
19790183ed61SJeremy L Thompson   // -- Loop over active field
19800183ed61SJeremy L Thompson   std::string active_var_suffix = "_in_" + std::to_string(active_field_index);
19810183ed61SJeremy L Thompson 
19820183ed61SJeremy L Thompson   code << "\n" << tab << "// Loop over nodes in active field\n";
19830183ed61SJeremy L Thompson   code << tab << "for (CeedInt n = 0; n < num_comp" << active_var_suffix << "*P_1d" << active_var_suffix
19840183ed61SJeremy L Thompson        << (max_dim > 1 ? "*P_1d" + active_var_suffix : "") << (max_dim > 2 ? "*P_1d" + active_var_suffix : "") << "; n++) {\n";
19850183ed61SJeremy L Thompson   tab.push();
19860183ed61SJeremy L Thompson 
19870183ed61SJeremy L Thompson   // -- Set current active node and component to 1
19880183ed61SJeremy L Thompson   code << tab << "// Set current active node and component to 1.0\n";
19890183ed61SJeremy L Thompson   code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 1.0, r_e"
19900183ed61SJeremy L Thompson        << active_var_suffix << ");\n\n";
19910183ed61SJeremy L Thompson 
19920183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
19930183ed61SJeremy L Thompson     bool          is_active = false;
19940183ed61SJeremy L Thompson     const char   *field_name;
19950183ed61SJeremy L Thompson     const CeedInt f = input_field_order[i];
19960183ed61SJeremy L Thompson 
19970183ed61SJeremy L Thompson     {
19980183ed61SJeremy L Thompson       CeedVector vec;
19990183ed61SJeremy L Thompson 
20000183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
20010183ed61SJeremy L Thompson       is_active = vec == CEED_VECTOR_ACTIVE;
20020183ed61SJeremy L Thompson       CeedCallBackend(CeedVectorDestroy(&vec));
20030183ed61SJeremy L Thompson     }
20040183ed61SJeremy L Thompson     if (!is_active) continue;
20050183ed61SJeremy L Thompson 
20060183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
20070183ed61SJeremy L Thompson     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
20080183ed61SJeremy L Thompson 
20090183ed61SJeremy L Thompson     // ---- Basis action
20100183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
20110183ed61SJeremy L Thompson                                                          is_all_tensor, is_at_points, use_3d_slices));
20120183ed61SJeremy L Thompson   }
20130183ed61SJeremy L Thompson 
20140183ed61SJeremy L Thompson   // -- Q function
20150183ed61SJeremy L Thompson   CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
20160183ed61SJeremy L Thompson                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
2017*745f16d1SZach Atkins                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices, true));
20180183ed61SJeremy L Thompson 
20190183ed61SJeremy L Thompson   // -- Output basis and restriction
20200183ed61SJeremy L Thompson   code << "\n" << tab << "// -- Output field basis action and restrictions\n";
20210183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
20220183ed61SJeremy L Thompson     bool        is_active = false;
20230183ed61SJeremy L Thompson     const char *field_name;
20240183ed61SJeremy L Thompson 
20250183ed61SJeremy L Thompson     {
20260183ed61SJeremy L Thompson       CeedVector vec;
20270183ed61SJeremy L Thompson 
20280183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
20290183ed61SJeremy L Thompson       is_active = vec == CEED_VECTOR_ACTIVE;
20300183ed61SJeremy L Thompson       CeedCallBackend(CeedVectorDestroy(&vec));
20310183ed61SJeremy L Thompson     }
20320183ed61SJeremy L Thompson     if (!is_active) continue;
20330183ed61SJeremy L Thompson 
20340183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
20350183ed61SJeremy L Thompson     code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
20360183ed61SJeremy L Thompson 
20370183ed61SJeremy L Thompson     // ---- Basis action
20380183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
20390183ed61SJeremy L Thompson                                                          is_all_tensor, is_at_points, use_3d_slices));
20400183ed61SJeremy L Thompson 
20410183ed61SJeremy L Thompson     // ---- Restriction
20420183ed61SJeremy L Thompson     if (is_full) {
2043692716b7SZach Atkins       std::string         var_suffix = "_out_" + std::to_string(i);
2044692716b7SZach Atkins       CeedInt             comp_stride;
2045692716b7SZach Atkins       CeedSize            l_size;
2046692716b7SZach Atkins       CeedElemRestriction elem_rstr;
2047692716b7SZach Atkins 
2048692716b7SZach Atkins       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
2049692716b7SZach Atkins       CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
2050692716b7SZach Atkins       code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
2051692716b7SZach Atkins       CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
2052692716b7SZach Atkins       code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
2053692716b7SZach Atkins       code << tab << "WriteLVecStandard" << max_dim << "d_Assembly<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
2054692716b7SZach Atkins            << ">(data, l_size" << var_suffix << ", elem, n, r_e" << var_suffix << ", values_array);\n";
2055692716b7SZach Atkins       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
20560183ed61SJeremy L Thompson     } else {
20570183ed61SJeremy L Thompson       std::string         var_suffix = "_out_" + std::to_string(i);
20580183ed61SJeremy L Thompson       CeedInt             comp_stride;
20590183ed61SJeremy L Thompson       CeedSize            l_size;
20600183ed61SJeremy L Thompson       CeedElemRestriction elem_rstr;
20610183ed61SJeremy L Thompson 
20620183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
20630183ed61SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
20640183ed61SJeremy L Thompson       code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
20650183ed61SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
20660183ed61SJeremy L Thompson       code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
20670183ed61SJeremy L Thompson       code << tab << "WriteLVecStandard" << max_dim << "d_Single<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
20680183ed61SJeremy L Thompson            << ">(data, l_size" << var_suffix << ", elem, n, indices.outputs[" << i << "], r_e" << var_suffix << ", values_array);\n";
20690183ed61SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
20700183ed61SJeremy L Thompson     }
20710183ed61SJeremy L Thompson   }
20720183ed61SJeremy L Thompson 
20730183ed61SJeremy L Thompson   // -- Reset current active node and component
20740183ed61SJeremy L Thompson   code << "\n" << tab << "// Reset current active node and component to 0.0\n";
20750183ed61SJeremy L Thompson   code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 0.0, r_e"
20760183ed61SJeremy L Thompson        << active_var_suffix << ");\n";
20770183ed61SJeremy L Thompson 
20780183ed61SJeremy L Thompson   // -- End of loop over active field
20790183ed61SJeremy L Thompson   tab.pop();
20800183ed61SJeremy L Thompson   code << tab << "}\n";
20810183ed61SJeremy L Thompson 
20820183ed61SJeremy L Thompson   // Close loop and function
20830183ed61SJeremy L Thompson   tab.pop();
20840183ed61SJeremy L Thompson   code << tab << "}\n";
20850183ed61SJeremy L Thompson   tab.pop();
20860183ed61SJeremy L Thompson   code << tab << "}\n";
20870183ed61SJeremy L Thompson   code << tab << "// -----------------------------------------------------------------------------\n\n";
20880183ed61SJeremy L Thompson 
20890183ed61SJeremy L Thompson   CeedInt block_sizes[3] = {0, 0, 0};
20900183ed61SJeremy L Thompson   CeedInt num_elem;
20910183ed61SJeremy L Thompson 
20920183ed61SJeremy L Thompson   // Compile
20930183ed61SJeremy L Thompson   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
20940183ed61SJeremy L Thompson   CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes));
20950183ed61SJeremy L Thompson   {
20960183ed61SJeremy L Thompson     bool is_compile_good = false;
20970183ed61SJeremy L Thompson 
20980183ed61SJeremy L Thompson     data->thread_1d = block_sizes[0];
20990183ed61SJeremy L Thompson     CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good,
21000183ed61SJeremy L Thompson                                        is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 2, "OP_T_1D", block_sizes[0],
21010183ed61SJeremy L Thompson                                        "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]));
21020183ed61SJeremy L Thompson     if (is_compile_good) {
21030183ed61SJeremy L Thompson       *is_good_build = true;
21040183ed61SJeremy L Thompson       CeedCallBackend(CeedGetKernel_Hip(ceed, is_full ? data->module_assemble_full : data->module_assemble_diagonal, operator_name.c_str(),
21050183ed61SJeremy L Thompson                                         is_full ? &data->assemble_full : &data->assemble_diagonal));
21060183ed61SJeremy L Thompson     } else {
21070183ed61SJeremy L Thompson       *is_good_build              = false;
21080183ed61SJeremy L Thompson       data->use_assembly_fallback = true;
21090183ed61SJeremy L Thompson     }
21100183ed61SJeremy L Thompson   }
21110183ed61SJeremy L Thompson   CeedCallBackend(CeedDestroy(&ceed));
21120183ed61SJeremy L Thompson   CeedCallBackend(CeedQFunctionDestroy(&qf));
21130183ed61SJeremy L Thompson   return CEED_ERROR_SUCCESS;
21140183ed61SJeremy L Thompson }
21150183ed61SJeremy L Thompson 
CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Hip_gen(CeedOperator op,bool * is_good_build)21160183ed61SJeremy L Thompson extern "C" int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) {
21170183ed61SJeremy L Thompson   return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, false, is_good_build);
21180183ed61SJeremy L Thompson }
21190183ed61SJeremy L Thompson 
CeedOperatorBuildKernelFullAssemblyAtPoints_Hip_gen(CeedOperator op,bool * is_good_build)2120692716b7SZach Atkins extern "C" int CeedOperatorBuildKernelFullAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) {
2121692716b7SZach Atkins   return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, true, is_good_build);
2122692716b7SZach Atkins }
21235daefc96SJeremy L Thompson //------------------------------------------------------------------------------
21245daefc96SJeremy L Thompson // Build QFunction assembly operator kernel
21255daefc96SJeremy L Thompson //------------------------------------------------------------------------------
CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperator op,bool * is_good_build)21265daefc96SJeremy L Thompson extern "C" int CeedOperatorBuildKernelLinearAssembleQFunction_Hip_gen(CeedOperator op, bool *is_good_build) {
21275daefc96SJeremy L Thompson   bool                   is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
21285daefc96SJeremy L Thompson   Ceed                   ceed;
21295daefc96SJeremy L Thompson   CeedInt                Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0;
21305daefc96SJeremy L Thompson   CeedQFunctionField    *qf_input_fields, *qf_output_fields;
21315daefc96SJeremy L Thompson   CeedQFunction_Hip_gen *qf_data;
21325daefc96SJeremy L Thompson   CeedQFunction          qf;
21335daefc96SJeremy L Thompson   CeedOperatorField     *op_input_fields, *op_output_fields;
21345daefc96SJeremy L Thompson   CeedOperator_Hip_gen  *data;
21355daefc96SJeremy L Thompson   std::ostringstream     code;
21365daefc96SJeremy L Thompson   Tab                    tab;
21375daefc96SJeremy L Thompson 
21385daefc96SJeremy L Thompson   // Check compatibility
21395daefc96SJeremy L Thompson   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
21405daefc96SJeremy L Thompson   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
21415daefc96SJeremy L Thompson   CeedCheck(!is_at_points, ceed, CEED_ERROR_BACKEND, "AtPoints QFunction assembly is not supported");
21425daefc96SJeremy L Thompson 
21435daefc96SJeremy L Thompson   // Check field compatibility
21445daefc96SJeremy L Thompson   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
21455daefc96SJeremy L Thompson   {
21465daefc96SJeremy L Thompson     bool has_shared_bases = true;
21475daefc96SJeremy L Thompson 
21485daefc96SJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) {
21495daefc96SJeremy L Thompson       CeedBasis basis;
21505daefc96SJeremy L Thompson 
21515daefc96SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
21525daefc96SJeremy L Thompson       if (basis != CEED_BASIS_NONE) {
21535daefc96SJeremy L Thompson         bool        is_tensor = true;
21545daefc96SJeremy L Thompson         const char *resource;
21555daefc96SJeremy L Thompson         char       *resource_root;
21565daefc96SJeremy L Thompson         Ceed        basis_ceed;
21575daefc96SJeremy L Thompson 
21585daefc96SJeremy L Thompson         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
21595daefc96SJeremy L Thompson         is_all_tensor    = is_all_tensor && is_tensor;
21605daefc96SJeremy L Thompson         is_all_nontensor = is_all_nontensor && !is_tensor;
21615daefc96SJeremy L Thompson         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
21625daefc96SJeremy L Thompson         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
21635daefc96SJeremy L Thompson         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
21645daefc96SJeremy L Thompson         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
21655daefc96SJeremy L Thompson         CeedCallBackend(CeedFree(&resource_root));
21665daefc96SJeremy L Thompson         CeedCallBackend(CeedDestroy(&basis_ceed));
21675daefc96SJeremy L Thompson       }
21685daefc96SJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis));
21695daefc96SJeremy L Thompson     }
21705daefc96SJeremy L Thompson 
21715daefc96SJeremy L Thompson     for (CeedInt i = 0; i < num_output_fields; i++) {
21725daefc96SJeremy L Thompson       CeedBasis basis;
21735daefc96SJeremy L Thompson 
21745daefc96SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
21755daefc96SJeremy L Thompson       if (basis != CEED_BASIS_NONE) {
21765daefc96SJeremy L Thompson         bool        is_tensor = true;
21775daefc96SJeremy L Thompson         const char *resource;
21785daefc96SJeremy L Thompson         char       *resource_root;
21795daefc96SJeremy L Thompson         Ceed        basis_ceed;
21805daefc96SJeremy L Thompson 
21815daefc96SJeremy L Thompson         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
21825daefc96SJeremy L Thompson         is_all_tensor    = is_all_tensor && is_tensor;
21835daefc96SJeremy L Thompson         is_all_nontensor = is_all_nontensor && !is_tensor;
21845daefc96SJeremy L Thompson 
21855daefc96SJeremy L Thompson         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
21865daefc96SJeremy L Thompson         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
21875daefc96SJeremy L Thompson         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
21885daefc96SJeremy L Thompson         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
21895daefc96SJeremy L Thompson         CeedCallBackend(CeedFree(&resource_root));
21905daefc96SJeremy L Thompson         CeedCallBackend(CeedDestroy(&basis_ceed));
21915daefc96SJeremy L Thompson       }
21925daefc96SJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis));
21935daefc96SJeremy L Thompson     }
21945daefc96SJeremy L Thompson   }
21955daefc96SJeremy L Thompson 
21965daefc96SJeremy L Thompson   // Retrieve operator data
21975daefc96SJeremy L Thompson   CeedCallBackend(CeedOperatorGetData(op, &data));
21985daefc96SJeremy L Thompson   Q       = data->Q;
21995daefc96SJeremy L Thompson   Q_1d    = data->Q_1d;
22005daefc96SJeremy L Thompson   max_dim = data->dim;
22015daefc96SJeremy L Thompson   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
22025daefc96SJeremy L Thompson   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
22035daefc96SJeremy L Thompson   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
22045daefc96SJeremy L Thompson 
22055daefc96SJeremy L Thompson   // Load basis source files
22065daefc96SJeremy L Thompson   if (!is_all_nontensor) {
22075daefc96SJeremy L Thompson     code << tab << "// Tensor basis source\n";
22085daefc96SJeremy L Thompson     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
22095daefc96SJeremy L Thompson   }
22105daefc96SJeremy L Thompson   if (!is_all_tensor) {
22115daefc96SJeremy L Thompson     code << tab << "// Non-tensor basis source\n";
22125daefc96SJeremy L Thompson     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n";
22135daefc96SJeremy L Thompson   }
22145daefc96SJeremy L Thompson   if (!is_all_tensor && !is_all_nontensor) {
22155daefc96SJeremy L Thompson     code << "// Tensor basis source\n";
22165daefc96SJeremy L Thompson     code << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h>\n\n";
22175daefc96SJeremy L Thompson   }
22185daefc96SJeremy L Thompson   code << "// CodeGen operator source\n";
22195daefc96SJeremy L Thompson   code << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
22205daefc96SJeremy L Thompson 
22215daefc96SJeremy L Thompson   // Get QFunction name
22225daefc96SJeremy L Thompson   std::string qfunction_name(qf_data->qfunction_name);
22235daefc96SJeremy L Thompson   std::string operator_name;
22245daefc96SJeremy L Thompson 
22255daefc96SJeremy L Thompson   operator_name = "CeedKernelHipGenQFunctionAssembly_" + qfunction_name;
22265daefc96SJeremy L Thompson 
22275daefc96SJeremy L Thompson   // Define CEED_Q_VLA
22285daefc96SJeremy L Thompson   code << "\n" << tab << "#undef CEED_Q_VLA\n";
22295daefc96SJeremy L Thompson   if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
22305daefc96SJeremy L Thompson     code << tab << "#define CEED_Q_VLA 1\n\n";
22315daefc96SJeremy L Thompson   } else {
22325daefc96SJeremy L Thompson     code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n";
22335daefc96SJeremy L Thompson   }
22345daefc96SJeremy L Thompson 
22355daefc96SJeremy L Thompson   // Add user QFunction source
22365daefc96SJeremy L Thompson   {
22375daefc96SJeremy L Thompson     const char *source_path;
22385daefc96SJeremy L Thompson 
22395daefc96SJeremy L Thompson     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
22405daefc96SJeremy L Thompson     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
22415daefc96SJeremy L Thompson 
22425daefc96SJeremy L Thompson     code << tab << "// User QFunction source\n";
22435daefc96SJeremy L Thompson     code << tab << "#include \"" << source_path << "\"\n\n";
22445daefc96SJeremy L Thompson   }
22455daefc96SJeremy L Thompson 
22465daefc96SJeremy L Thompson   // Setup
22475daefc96SJeremy L Thompson   code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
22485daefc96SJeremy L Thompson   code << tab << "// Operator Assembly Kernel\n";
22495daefc96SJeremy L Thompson   code << tab << "// \n";
22505daefc96SJeremy L Thompson   code << tab << "// d_[in,out]_i:   CeedVector device array\n";
22515daefc96SJeremy L Thompson   code << tab << "// r_[in,out]_e_i: Element vector register\n";
22525daefc96SJeremy L Thompson   code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
22535daefc96SJeremy L Thompson   code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
22545daefc96SJeremy L Thompson   code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
22555daefc96SJeremy L Thompson   code << tab << "// \n";
22565daefc96SJeremy L Thompson   code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
22575daefc96SJeremy L Thompson   code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
22585daefc96SJeremy L Thompson   code << tab << "// -----------------------------------------------------------------------------\n";
22595daefc96SJeremy L Thompson   code << tab << "extern \"C\" __global__ void " << operator_name
22605daefc96SJeremy L Thompson        << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar *W, Points_Hip "
22615daefc96SJeremy L Thompson           "points, CeedScalar *__restrict__ values_array) {\n";
22625daefc96SJeremy L Thompson   tab.push();
22635daefc96SJeremy L Thompson 
22645daefc96SJeremy L Thompson   // Scratch buffers
22655daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
22665daefc96SJeremy L Thompson     CeedEvalMode eval_mode;
22675daefc96SJeremy L Thompson 
22685daefc96SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
22695daefc96SJeremy L Thompson     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
22705daefc96SJeremy L Thompson       code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
22715daefc96SJeremy L Thompson     }
22725daefc96SJeremy L Thompson   }
22735daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
2274*745f16d1SZach Atkins     bool is_active = false;
2275*745f16d1SZach Atkins 
2276*745f16d1SZach Atkins     {
2277*745f16d1SZach Atkins       CeedVector vec;
2278*745f16d1SZach Atkins 
2279*745f16d1SZach Atkins       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
2280*745f16d1SZach Atkins       is_active = vec == CEED_VECTOR_ACTIVE;
2281*745f16d1SZach Atkins       CeedCallBackend(CeedVectorDestroy(&vec));
2282*745f16d1SZach Atkins     }
2283*745f16d1SZach Atkins     if (is_active) {
22845daefc96SJeremy L Thompson       code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
22855daefc96SJeremy L Thompson     }
2286*745f16d1SZach Atkins   }
22875daefc96SJeremy L Thompson 
22885daefc96SJeremy L Thompson   code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
22895daefc96SJeremy L Thompson   if (!is_all_tensor) {
22905daefc96SJeremy L Thompson     code << tab << "const CeedInt Q = " << Q << ";\n";
22915daefc96SJeremy L Thompson   }
22925daefc96SJeremy L Thompson   if (!is_all_nontensor) {
22935daefc96SJeremy L Thompson     code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
22945daefc96SJeremy L Thompson   }
22955daefc96SJeremy L Thompson 
22965daefc96SJeremy L Thompson   // Shared data
22975daefc96SJeremy L Thompson   code << tab << "extern __shared__ CeedScalar slice[];\n";
22985daefc96SJeremy L Thompson   code << tab << "SharedData_Hip data;\n";
22995daefc96SJeremy L Thompson   code << tab << "data.t_id_x = threadIdx.x;\n";
23005daefc96SJeremy L Thompson   code << tab << "data.t_id_y = threadIdx.y;\n";
23015daefc96SJeremy L Thompson   code << tab << "data.t_id_z = threadIdx.z;\n";
23025daefc96SJeremy L Thompson   code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
23035daefc96SJeremy L Thompson   code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
23045daefc96SJeremy L Thompson 
23055daefc96SJeremy L Thompson   // -- Determine input mat reuse
23065daefc96SJeremy L Thompson   FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
23075daefc96SJeremy L Thompson 
23085daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
23095daefc96SJeremy L Thompson     input_matrix_reuse[i].index = -1;
23105daefc96SJeremy L Thompson   }
23115daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
23125daefc96SJeremy L Thompson     bool         is_tensor = true;
23135daefc96SJeremy L Thompson     CeedEvalMode eval_mode_i;
23145daefc96SJeremy L Thompson     CeedBasis    basis_i;
23155daefc96SJeremy L Thompson 
23165daefc96SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
23175daefc96SJeremy L Thompson     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
23185daefc96SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
23195daefc96SJeremy L Thompson     CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
23205daefc96SJeremy L Thompson     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
23215daefc96SJeremy L Thompson       CeedEvalMode eval_mode_j;
23225daefc96SJeremy L Thompson       CeedBasis    basis_j;
23235daefc96SJeremy L Thompson 
23245daefc96SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
23255daefc96SJeremy L Thompson       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
23265daefc96SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
23275daefc96SJeremy L Thompson       if (basis_i == basis_j) {
23285daefc96SJeremy L Thompson         if (is_tensor) {
23295daefc96SJeremy L Thompson           input_matrix_reuse[i].index     = j;
23305daefc96SJeremy L Thompson           input_matrix_reuse[i].is_input  = true;
23315daefc96SJeremy L Thompson           input_matrix_reuse[i].eval_mode = eval_mode_j;
23325daefc96SJeremy L Thompson         } else {
23335daefc96SJeremy L Thompson           // For non-tensor can only re-use with the same eval mode
23345daefc96SJeremy L Thompson           if (eval_mode_i == eval_mode_j) {
23355daefc96SJeremy L Thompson             input_matrix_reuse[i].index     = j;
23365daefc96SJeremy L Thompson             input_matrix_reuse[i].is_input  = true;
23375daefc96SJeremy L Thompson             input_matrix_reuse[i].eval_mode = eval_mode_j;
23385daefc96SJeremy L Thompson           }
23395daefc96SJeremy L Thompson         }
23405daefc96SJeremy L Thompson       }
23415daefc96SJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis_j));
23425daefc96SJeremy L Thompson     }
23435daefc96SJeremy L Thompson     CeedCallBackend(CeedBasisDestroy(&basis_i));
23445daefc96SJeremy L Thompson   }
23455daefc96SJeremy L Thompson 
23465daefc96SJeremy L Thompson   // -- Determine output mat reuse
23475daefc96SJeremy L Thompson   FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
23485daefc96SJeremy L Thompson 
23495daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
23505daefc96SJeremy L Thompson     output_matrix_reuse[i].index = -1;
23515daefc96SJeremy L Thompson   }
23525daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
23535daefc96SJeremy L Thompson     bool         is_tensor = true;
23545daefc96SJeremy L Thompson     CeedEvalMode eval_mode_i;
23555daefc96SJeremy L Thompson     CeedBasis    basis_i;
23565daefc96SJeremy L Thompson 
23575daefc96SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
23585daefc96SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
23595daefc96SJeremy L Thompson     CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
23605daefc96SJeremy L Thompson     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
23615daefc96SJeremy L Thompson       CeedEvalMode eval_mode_j;
23625daefc96SJeremy L Thompson       CeedBasis    basis_j;
23635daefc96SJeremy L Thompson 
23645daefc96SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
23655daefc96SJeremy L Thompson       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
23665daefc96SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
23675daefc96SJeremy L Thompson       if (basis_i == basis_j) {
23685daefc96SJeremy L Thompson         if (is_tensor) {
23695daefc96SJeremy L Thompson           output_matrix_reuse[i].index     = j;
23705daefc96SJeremy L Thompson           output_matrix_reuse[i].is_input  = true;
23715daefc96SJeremy L Thompson           output_matrix_reuse[i].eval_mode = eval_mode_j;
23725daefc96SJeremy L Thompson         } else {
23735daefc96SJeremy L Thompson           // For non-tensor can only re-use with the same eval mode
23745daefc96SJeremy L Thompson           if (eval_mode_i == eval_mode_j) {
23755daefc96SJeremy L Thompson             output_matrix_reuse[i].index     = j;
23765daefc96SJeremy L Thompson             output_matrix_reuse[i].is_input  = true;
23775daefc96SJeremy L Thompson             output_matrix_reuse[i].eval_mode = eval_mode_j;
23785daefc96SJeremy L Thompson           }
23795daefc96SJeremy L Thompson         }
23805daefc96SJeremy L Thompson       }
23815daefc96SJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis_j));
23825daefc96SJeremy L Thompson     }
23835daefc96SJeremy L Thompson     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
23845daefc96SJeremy L Thompson       CeedEvalMode eval_mode_j;
23855daefc96SJeremy L Thompson       CeedBasis    basis_j;
23865daefc96SJeremy L Thompson 
23875daefc96SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
23885daefc96SJeremy L Thompson       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
23895daefc96SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
23905daefc96SJeremy L Thompson       if (basis_i == basis_j) {
23915daefc96SJeremy L Thompson         if (is_tensor) {
23925daefc96SJeremy L Thompson           output_matrix_reuse[i].index     = j;
23935daefc96SJeremy L Thompson           output_matrix_reuse[i].is_input  = false;
23945daefc96SJeremy L Thompson           output_matrix_reuse[i].eval_mode = eval_mode_j;
23955daefc96SJeremy L Thompson         } else {
23965daefc96SJeremy L Thompson           // For non-tensor can only re-use with the same eval mode
23975daefc96SJeremy L Thompson           if (eval_mode_i == eval_mode_j) {
23985daefc96SJeremy L Thompson             output_matrix_reuse[i].index     = j;
23995daefc96SJeremy L Thompson             output_matrix_reuse[i].is_input  = false;
24005daefc96SJeremy L Thompson             output_matrix_reuse[i].eval_mode = eval_mode_j;
24015daefc96SJeremy L Thompson           }
24025daefc96SJeremy L Thompson         }
24035daefc96SJeremy L Thompson       }
24045daefc96SJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis_j));
24055daefc96SJeremy L Thompson     }
24065daefc96SJeremy L Thompson     CeedCallBackend(CeedBasisDestroy(&basis_i));
24075daefc96SJeremy L Thompson   }
24085daefc96SJeremy L Thompson 
24095daefc96SJeremy L Thompson   // Initialize constants, and matrices B and G
24105daefc96SJeremy L Thompson   code << "\n" << tab << "// Input field constants and basis data\n";
24115daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
24125daefc96SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
2413ca1da9b9SJeremy L Thompson                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices, true));
24145daefc96SJeremy L Thompson   }
24155daefc96SJeremy L Thompson   code << "\n" << tab << "// Output field constants and basis data\n";
24165daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
24175daefc96SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
2418ca1da9b9SJeremy L Thompson                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices, true));
24195daefc96SJeremy L Thompson   }
24205daefc96SJeremy L Thompson 
24215daefc96SJeremy L Thompson   // Loop over all elements
24225daefc96SJeremy L Thompson   code << "\n" << tab << "// Element loop\n";
24235daefc96SJeremy L Thompson   code << tab << "__syncthreads();\n";
24245daefc96SJeremy L Thompson   code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
24255daefc96SJeremy L Thompson   tab.push();
24265daefc96SJeremy L Thompson 
24275daefc96SJeremy L Thompson   // -- Compute minimum buffer space needed
24285daefc96SJeremy L Thompson   CeedInt max_rstr_buffer_size = 1;
24295daefc96SJeremy L Thompson 
24305daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
24315daefc96SJeremy L Thompson     CeedEvalMode eval_mode;
24325daefc96SJeremy L Thompson 
24335daefc96SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
24345daefc96SJeremy L Thompson     if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
24355daefc96SJeremy L Thompson       CeedInt             num_comp;
24365daefc96SJeremy L Thompson       CeedElemRestriction elem_rstr;
24375daefc96SJeremy L Thompson 
24385daefc96SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
24395daefc96SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
24405daefc96SJeremy L Thompson       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
24415daefc96SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
24425daefc96SJeremy L Thompson     }
24435daefc96SJeremy L Thompson   }
24445daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
24455daefc96SJeremy L Thompson     CeedEvalMode eval_mode;
24465daefc96SJeremy L Thompson 
24475daefc96SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
24485daefc96SJeremy L Thompson     if (eval_mode != CEED_EVAL_NONE) {
24495daefc96SJeremy L Thompson       CeedInt             num_comp;
24505daefc96SJeremy L Thompson       CeedElemRestriction elem_rstr;
24515daefc96SJeremy L Thompson 
24525daefc96SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
24535daefc96SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
24545daefc96SJeremy L Thompson       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
24555daefc96SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
24565daefc96SJeremy L Thompson     }
24575daefc96SJeremy L Thompson   }
24585daefc96SJeremy L Thompson   code << tab << "// Scratch restriction buffer space\n";
24595daefc96SJeremy L Thompson   code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
24605daefc96SJeremy L Thompson 
24615daefc96SJeremy L Thompson   // -- Determine best input field processing order
24625daefc96SJeremy L Thompson   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
24635daefc96SJeremy L Thompson 
24645daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
24655daefc96SJeremy L Thompson     field_rstr_in_buffer[i] = -1;
24665daefc96SJeremy L Thompson     input_field_order[i]    = -1;
24675daefc96SJeremy L Thompson   }
24685daefc96SJeremy L Thompson   {
24695daefc96SJeremy L Thompson     bool    is_ordered[CEED_FIELD_MAX];
24705daefc96SJeremy L Thompson     CeedInt curr_index = 0;
24715daefc96SJeremy L Thompson 
24725daefc96SJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
24735daefc96SJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) {
24745daefc96SJeremy L Thompson       CeedVector          vec_i;
24755daefc96SJeremy L Thompson       CeedElemRestriction rstr_i;
24765daefc96SJeremy L Thompson 
24775daefc96SJeremy L Thompson       if (is_ordered[i]) continue;
24785daefc96SJeremy L Thompson       field_rstr_in_buffer[i]       = i;
24795daefc96SJeremy L Thompson       is_ordered[i]                 = true;
24805daefc96SJeremy L Thompson       input_field_order[curr_index] = i;
24815daefc96SJeremy L Thompson       curr_index++;
24825daefc96SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
24835daefc96SJeremy L Thompson       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
24845daefc96SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
24855daefc96SJeremy L Thompson       for (CeedInt j = i + 1; j < num_input_fields; j++) {
24865daefc96SJeremy L Thompson         CeedVector          vec_j;
24875daefc96SJeremy L Thompson         CeedElemRestriction rstr_j;
24885daefc96SJeremy L Thompson 
24895daefc96SJeremy L Thompson         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
24905daefc96SJeremy L Thompson         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
24915daefc96SJeremy L Thompson         if (rstr_i == rstr_j && vec_i == vec_j) {
24925daefc96SJeremy L Thompson           field_rstr_in_buffer[j]       = i;
24935daefc96SJeremy L Thompson           is_ordered[j]                 = true;
24945daefc96SJeremy L Thompson           input_field_order[curr_index] = j;
24955daefc96SJeremy L Thompson           curr_index++;
24965daefc96SJeremy L Thompson         }
24975daefc96SJeremy L Thompson         CeedCallBackend(CeedVectorDestroy(&vec_j));
24985daefc96SJeremy L Thompson         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
24995daefc96SJeremy L Thompson       }
25005daefc96SJeremy L Thompson       CeedCallBackend(CeedVectorDestroy(&vec_i));
25015daefc96SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
25025daefc96SJeremy L Thompson     }
25035daefc96SJeremy L Thompson   }
25045daefc96SJeremy L Thompson 
25055daefc96SJeremy L Thompson   // -- Input restriction and basis
25065daefc96SJeremy L Thompson   code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
25075daefc96SJeremy L Thompson   CeedInt num_active_in = 0, num_active_out = 0, qf_assembly_size_out = 0;
25085daefc96SJeremy L Thompson   CeedInt active_fields_in[CEED_FIELD_MAX], active_fields_out[CEED_FIELD_MAX];
25095daefc96SJeremy L Thompson 
25105daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
25115daefc96SJeremy L Thompson     bool          is_active = false;
25125daefc96SJeremy L Thompson     const char   *field_name;
25135daefc96SJeremy L Thompson     const CeedInt f = input_field_order[i];
25145daefc96SJeremy L Thompson 
25155daefc96SJeremy L Thompson     {
25165daefc96SJeremy L Thompson       CeedVector vec;
25175daefc96SJeremy L Thompson 
25185daefc96SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
25195daefc96SJeremy L Thompson       is_active = vec == CEED_VECTOR_ACTIVE;
25205daefc96SJeremy L Thompson       CeedCallBackend(CeedVectorDestroy(&vec));
25215daefc96SJeremy L Thompson     }
25225daefc96SJeremy L Thompson 
25235daefc96SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
25245daefc96SJeremy L Thompson     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
25255daefc96SJeremy L Thompson 
25265daefc96SJeremy L Thompson     if (is_active) {
25275daefc96SJeremy L Thompson       CeedEvalMode eval_mode;
25285daefc96SJeremy L Thompson       CeedInt      field_size;
25295daefc96SJeremy L Thompson 
25305daefc96SJeremy L Thompson       active_fields_in[num_active_in] = f;
25315daefc96SJeremy L Thompson       num_active_in++;
25325daefc96SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetSize(qf_input_fields[f], &field_size));
25335daefc96SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[f], &eval_mode));
25345daefc96SJeremy L Thompson       if (eval_mode == CEED_EVAL_GRAD) {
25355daefc96SJeremy L Thompson         code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << "dim_in_" << f << "*"
25365daefc96SJeremy L Thompson              << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
25375daefc96SJeremy L Thompson       } else {
25385daefc96SJeremy L Thompson         code << tab << "CeedScalar r_q_in_" << f << "[num_comp_in_" << f << "*" << (is_all_tensor && (max_dim >= 3) ? "Q_1d" : "1") << "] = {0.};\n";
25395daefc96SJeremy L Thompson       }
25405daefc96SJeremy L Thompson       code << tab << "const CeedInt field_size_in_" << f << " = " << field_size << ";\n";
25415daefc96SJeremy L Thompson     } else {
25425daefc96SJeremy L Thompson       // ---- Restriction
25435daefc96SJeremy L Thompson       CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
25445daefc96SJeremy L Thompson                                                                  max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
25455daefc96SJeremy L Thompson 
25465daefc96SJeremy L Thompson       // ---- Basis action
25475daefc96SJeremy L Thompson       CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
25485daefc96SJeremy L Thompson                                                            is_all_tensor, is_at_points, use_3d_slices));
25495daefc96SJeremy L Thompson     }
25505daefc96SJeremy L Thompson   }
25515daefc96SJeremy L Thompson   code << tab << "const CeedInt field_sizes_in[" << num_active_in << "] = {";
25525daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_active_in; i++) {
25535daefc96SJeremy L Thompson     code << "field_size_in_" << active_fields_in[i] << (i < num_active_in - 1 ? ", " : "");
25545daefc96SJeremy L Thompson   }
25555daefc96SJeremy L Thompson   code << "};\n";
25565daefc96SJeremy L Thompson   code << tab << "CeedScalar * r_q_in[" << num_active_in << "] = {";
25575daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_active_in; i++) {
25585daefc96SJeremy L Thompson     code << "r_q_in_" << active_fields_in[i] << (i < num_active_in - 1 ? ", " : "");
25595daefc96SJeremy L Thompson   }
25605daefc96SJeremy L Thompson   code << "};\n";
25615daefc96SJeremy L Thompson 
25625daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
25635daefc96SJeremy L Thompson     bool is_active = false;
25645daefc96SJeremy L Thompson 
25655daefc96SJeremy L Thompson     {
25665daefc96SJeremy L Thompson       CeedVector vec;
25675daefc96SJeremy L Thompson 
25685daefc96SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
25695daefc96SJeremy L Thompson       is_active = vec == CEED_VECTOR_ACTIVE;
25705daefc96SJeremy L Thompson       CeedCallBackend(CeedVectorDestroy(&vec));
25715daefc96SJeremy L Thompson     }
25725daefc96SJeremy L Thompson     if (is_active) {
25735daefc96SJeremy L Thompson       const char *field_name;
25745daefc96SJeremy L Thompson       CeedInt     field_size;
25755daefc96SJeremy L Thompson 
25765daefc96SJeremy L Thompson       active_fields_out[num_active_out] = i;
25775daefc96SJeremy L Thompson       num_active_out++;
25785daefc96SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
25795daefc96SJeremy L Thompson       qf_assembly_size_out += field_size;
25805daefc96SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
25815daefc96SJeremy L Thompson       code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
25825daefc96SJeremy L Thompson       code << tab << "const CeedInt field_size_out_" << i << " = " << field_size << ";\n";
25835daefc96SJeremy L Thompson     }
25845daefc96SJeremy L Thompson   }
25855daefc96SJeremy L Thompson   code << tab << "const CeedInt field_sizes_out[" << num_active_out << "] = {";
25865daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_active_out; i++) {
25875daefc96SJeremy L Thompson     code << "field_size_out_" << active_fields_out[i] << (i < num_active_out - 1 ? ", " : "");
25885daefc96SJeremy L Thompson   }
25895daefc96SJeremy L Thompson   code << "};\n";
25905daefc96SJeremy L Thompson   code << tab << "const CeedInt total_size_out = " << qf_assembly_size_out << ";\n";
25915daefc96SJeremy L Thompson 
25925daefc96SJeremy L Thompson   // -- Loop over active field
25935daefc96SJeremy L Thompson   code << "\n" << tab << "CeedInt input_offset = 0;\n";
25945daefc96SJeremy L Thompson   code << tab << "// Loop over active QFunction input fields\n";
25955daefc96SJeremy L Thompson   code << tab << "const CeedInt num_active_in = " << num_active_in << ";\n";
25965daefc96SJeremy L Thompson   code << tab << "for (CeedInt a = 0; a < num_active_in; a++) {\n";
25975daefc96SJeremy L Thompson   tab.push();
25985daefc96SJeremy L Thompson 
25995daefc96SJeremy L Thompson   // -- Loop over size of active field
26005daefc96SJeremy L Thompson   code << "\n" << tab << "// Loop over current active input field size\n";
26015daefc96SJeremy L Thompson   code << tab << "const CeedInt field_size_in = field_sizes_in[a];\n";
26025daefc96SJeremy L Thompson   code << tab << "for (CeedInt s = 0; s < field_size_in; s++) {\n";
26035daefc96SJeremy L Thompson   tab.push();
26045daefc96SJeremy L Thompson 
26055daefc96SJeremy L Thompson   // -- Set current active point and component to 1
26065daefc96SJeremy L Thompson   code << tab << "// Set current active point and component to 1.0\n";
26075daefc96SJeremy L Thompson   if (is_all_tensor && (max_dim >= 3)) {
26085daefc96SJeremy L Thompson     code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 1.0;\n";
26095daefc96SJeremy L Thompson   } else {
26105daefc96SJeremy L Thompson     code << tab << "r_q_in[a][s] = 1.0;\n";
26115daefc96SJeremy L Thompson   }
26125daefc96SJeremy L Thompson 
26135daefc96SJeremy L Thompson   // -- Q function
26145daefc96SJeremy L Thompson   CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
26155daefc96SJeremy L Thompson                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
2616*745f16d1SZach Atkins                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices, true));
26175daefc96SJeremy L Thompson 
26185daefc96SJeremy L Thompson   // -- Output basis and restriction
26195daefc96SJeremy L Thompson   code << "\n" << tab << "// -- Output field basis action and restrictions\n";
26205daefc96SJeremy L Thompson   CeedScalar offset = 0;
26215daefc96SJeremy L Thompson 
26225daefc96SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
26235daefc96SJeremy L Thompson     bool        is_active = false;
26245daefc96SJeremy L Thompson     const char *field_name;
26255daefc96SJeremy L Thompson 
26265daefc96SJeremy L Thompson     {
26275daefc96SJeremy L Thompson       CeedVector vec;
26285daefc96SJeremy L Thompson 
26295daefc96SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
26305daefc96SJeremy L Thompson       is_active = vec == CEED_VECTOR_ACTIVE;
26315daefc96SJeremy L Thompson       CeedCallBackend(CeedVectorDestroy(&vec));
26325daefc96SJeremy L Thompson     }
26335daefc96SJeremy L Thompson     if (!is_active) continue;
26345daefc96SJeremy L Thompson 
26355daefc96SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
26365daefc96SJeremy L Thompson     code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
26375daefc96SJeremy L Thompson 
26385daefc96SJeremy L Thompson     // ---- Restriction
26395daefc96SJeremy L Thompson     CeedInt field_size;
26405daefc96SJeremy L Thompson 
26415daefc96SJeremy L Thompson     code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d_QFAssembly<total_size_out, field_size_out_" << i << ", "
26425daefc96SJeremy L Thompson          << (is_all_tensor ? "Q_1d" : "Q") << ">(data, num_elem, elem, input_offset + s, " << offset << ", r_q_out_" << i << ", values_array);\n";
26435daefc96SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetSize(qf_output_fields[i], &field_size));
26445daefc96SJeremy L Thompson     offset += field_size;
26455daefc96SJeremy L Thompson   }
26465daefc96SJeremy L Thompson 
26475daefc96SJeremy L Thompson   // -- Reset current active node and component
26485daefc96SJeremy L Thompson   code << "\n" << tab << "// Reset current active node and component to 0.0\n";
26495daefc96SJeremy L Thompson   if (is_all_tensor && (max_dim >= 3)) {
26505daefc96SJeremy L Thompson     code << tab << "for (CeedInt i = 0; i < Q_1d; i++) r_q_in[a][i + s * Q_1d] = 0.0;\n";
26515daefc96SJeremy L Thompson   } else {
26525daefc96SJeremy L Thompson     code << tab << "r_q_in[a][s] = 0.0;\n";
26535daefc96SJeremy L Thompson   }
26545daefc96SJeremy L Thompson 
26555daefc96SJeremy L Thompson   // -- End of loop over size of active field
26565daefc96SJeremy L Thompson   tab.pop();
26575daefc96SJeremy L Thompson   code << tab << "}\n";
26585daefc96SJeremy L Thompson   code << tab << "input_offset += field_size_in;\n";
26595daefc96SJeremy L Thompson 
26605daefc96SJeremy L Thompson   // -- End of loop over active field
26615daefc96SJeremy L Thompson   tab.pop();
26625daefc96SJeremy L Thompson   code << tab << "}\n";
26635daefc96SJeremy L Thompson 
26645daefc96SJeremy L Thompson   // Close loop and function
26655daefc96SJeremy L Thompson   tab.pop();
26665daefc96SJeremy L Thompson   code << tab << "}\n";
26675daefc96SJeremy L Thompson   tab.pop();
26685daefc96SJeremy L Thompson   code << tab << "}\n";
26695daefc96SJeremy L Thompson   code << tab << "// -----------------------------------------------------------------------------\n\n";
26705daefc96SJeremy L Thompson 
26715daefc96SJeremy L Thompson   CeedInt block_sizes[3] = {0, 0, 0};
26725daefc96SJeremy L Thompson   CeedInt num_elem;
26735daefc96SJeremy L Thompson 
26745daefc96SJeremy L Thompson   // Compile
26755daefc96SJeremy L Thompson   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
26765daefc96SJeremy L Thompson   CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes));
26775daefc96SJeremy L Thompson   {
26785daefc96SJeremy L Thompson     bool is_compile_good = false;
26795daefc96SJeremy L Thompson 
26805daefc96SJeremy L Thompson     data->thread_1d = block_sizes[0];
26815daefc96SJeremy L Thompson     CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module_assemble_qfunction, 2, "OP_T_1D", block_sizes[0],
26825daefc96SJeremy L Thompson                                        "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]));
26835daefc96SJeremy L Thompson     if (is_compile_good) {
26845daefc96SJeremy L Thompson       *is_good_build = true;
26855daefc96SJeremy L Thompson       CeedCallBackend(CeedGetKernel_Hip(ceed, data->module_assemble_qfunction, operator_name.c_str(), &data->assemble_qfunction));
26865daefc96SJeremy L Thompson     } else {
26875daefc96SJeremy L Thompson       *is_good_build              = false;
26885daefc96SJeremy L Thompson       data->use_assembly_fallback = true;
26895daefc96SJeremy L Thompson     }
26905daefc96SJeremy L Thompson   }
26915daefc96SJeremy L Thompson   CeedCallBackend(CeedDestroy(&ceed));
26925daefc96SJeremy L Thompson   CeedCallBackend(CeedQFunctionDestroy(&qf));
26935daefc96SJeremy L Thompson   return CEED_ERROR_SUCCESS;
26945daefc96SJeremy L Thompson }
2695692716b7SZach Atkins 
26960183ed61SJeremy L Thompson //------------------------------------------------------------------------------
2697