xref: /libCEED/backends/hip-gen/ceed-hip-gen-operator-build.cpp (revision 0183ed61035d97ff853cf8c8e722c0fda76e54df)
1d275d636SJeremy L Thompson // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
23d8e8822SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
37d8d0e25Snbeams //
43d8e8822SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
57d8d0e25Snbeams //
63d8e8822SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
73d576824SJeremy L Thompson 
87d8d0e25Snbeams #define CEED_DEBUG_COLOR 12
97d8d0e25Snbeams 
1049aac155SJeremy L Thompson #include <ceed.h>
11ec3da8bcSJed Brown #include <ceed/backend.h>
12*0183ed61SJeremy L Thompson #include <ceed/gen-tools.h>
139e201c85SYohann #include <ceed/jit-tools.h>
142b730f8bSJeremy L Thompson 
157d8d0e25Snbeams #include <iostream>
167d8d0e25Snbeams #include <sstream>
172b730f8bSJeremy L Thompson #include <string>
182b730f8bSJeremy L Thompson 
190d0321e0SJeremy L Thompson #include "../hip-ref/ceed-hip-ref.h"
207d8d0e25Snbeams #include "../hip-shared/ceed-hip-shared.h"
21b2165e7aSSebastian Grimberg #include "../hip/ceed-hip-common.h"
227d8d0e25Snbeams #include "../hip/ceed-hip-compile.h"
232b730f8bSJeremy L Thompson #include "ceed-hip-gen.h"
24b3e1519bSnbeams 
2545a787f7SJeremy L Thompson struct FieldReuse_Hip {
2645a787f7SJeremy L Thompson   CeedInt      index;
2745a787f7SJeremy L Thompson   bool         is_input;
2845a787f7SJeremy L Thompson   CeedEvalMode eval_mode;
2945a787f7SJeremy L Thompson };
3045a787f7SJeremy L Thompson 
31b3e1519bSnbeams //------------------------------------------------------------------------------
32b3e1519bSnbeams // Calculate the block size used for launching the operator kernel
33b3e1519bSnbeams //------------------------------------------------------------------------------
342b730f8bSJeremy L Thompson extern "C" int BlockGridCalculate_Hip_gen(const CeedInt dim, const CeedInt num_elem, const CeedInt P_1d, const CeedInt Q_1d, CeedInt *block_sizes) {
353a2968d6SJeremy L Thompson   const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
36b3e1519bSnbeams   if (dim == 1) {
373a2968d6SJeremy L Thompson     CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64;
38b7453713SJeremy L Thompson 
399e201c85SYohann     elems_per_block = elems_per_block > 0 ? elems_per_block : 1;
403a2968d6SJeremy L Thompson     block_sizes[0]  = thread_1d;
41b3e1519bSnbeams     block_sizes[1]  = 1;
429e201c85SYohann     block_sizes[2]  = elems_per_block;
43b3e1519bSnbeams   } else if (dim == 2) {
443a2968d6SJeremy L Thompson     const CeedInt elems_per_block = thread_1d < 4 ? 16 : 2;
45b7453713SJeremy L Thompson 
463a2968d6SJeremy L Thompson     block_sizes[0] = thread_1d;
473a2968d6SJeremy L Thompson     block_sizes[1] = thread_1d;
489e201c85SYohann     block_sizes[2] = elems_per_block;
49b3e1519bSnbeams   } else if (dim == 3) {
503a2968d6SJeremy L Thompson     const CeedInt elems_per_block = thread_1d < 6 ? 4 : (thread_1d < 8 ? 2 : 1);
51b7453713SJeremy L Thompson 
523a2968d6SJeremy L Thompson     block_sizes[0] = thread_1d;
533a2968d6SJeremy L Thompson     block_sizes[1] = thread_1d;
549e201c85SYohann     block_sizes[2] = elems_per_block;
55b3e1519bSnbeams   }
56b3e1519bSnbeams   return CEED_ERROR_SUCCESS;
57b3e1519bSnbeams }
58b3e1519bSnbeams 
597d8d0e25Snbeams //------------------------------------------------------------------------------
604b3e95d5SJeremy L Thompson // Determine type of operator
614b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
624b3e95d5SJeremy L Thompson static int CeedOperatorBuildKernelData_Hip_gen(Ceed ceed, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
634b3e95d5SJeremy L Thompson                                                CeedQFunctionField *qf_input_fields, CeedInt num_output_fields, CeedOperatorField *op_output_fields,
6474398b5aSJeremy L Thompson                                                CeedQFunctionField *qf_output_fields, CeedInt *max_P, CeedInt *max_P_1d, CeedInt *Q, CeedInt *Q_1d,
6574398b5aSJeremy L Thompson                                                CeedInt *max_dim, bool *is_all_tensor, bool *use_3d_slices) {
6674398b5aSJeremy L Thompson   // Check if all are tensor
6774398b5aSJeremy L Thompson   *is_all_tensor = true;
684b3e95d5SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
694b3e95d5SJeremy L Thompson     CeedBasis basis;
704b3e95d5SJeremy L Thompson 
714b3e95d5SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
724b3e95d5SJeremy L Thompson     if (basis != CEED_BASIS_NONE) {
734b3e95d5SJeremy L Thompson       bool is_field_tensor;
744b3e95d5SJeremy L Thompson 
754b3e95d5SJeremy L Thompson       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
7674398b5aSJeremy L Thompson       *is_all_tensor = *is_all_tensor && is_field_tensor;
774b3e95d5SJeremy L Thompson     }
783a2968d6SJeremy L Thompson     CeedCallBackend(CeedBasisDestroy(&basis));
794b3e95d5SJeremy L Thompson   }
804b3e95d5SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
814b3e95d5SJeremy L Thompson     CeedBasis basis;
824b3e95d5SJeremy L Thompson 
834b3e95d5SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
844b3e95d5SJeremy L Thompson     if (basis != CEED_BASIS_NONE) {
854b3e95d5SJeremy L Thompson       bool is_field_tensor;
864b3e95d5SJeremy L Thompson 
874b3e95d5SJeremy L Thompson       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
8874398b5aSJeremy L Thompson       *is_all_tensor = *is_all_tensor && is_field_tensor;
8974398b5aSJeremy L Thompson     }
9074398b5aSJeremy L Thompson     CeedCallBackend(CeedBasisDestroy(&basis));
9174398b5aSJeremy L Thompson   }
9274398b5aSJeremy L Thompson 
9374398b5aSJeremy L Thompson   // Find max_P, max_P_1d, Q, and Q_1d
9474398b5aSJeremy L Thompson   bool is_all_3d = true;
9574398b5aSJeremy L Thompson 
9674398b5aSJeremy L Thompson   *max_P    = 0;
9774398b5aSJeremy L Thompson   *max_P_1d = 0;
9874398b5aSJeremy L Thompson   *Q        = 0;
9974398b5aSJeremy L Thompson   *Q_1d     = 0;
10074398b5aSJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
10174398b5aSJeremy L Thompson     CeedBasis basis;
10274398b5aSJeremy L Thompson 
10374398b5aSJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
10474398b5aSJeremy L Thompson     if (basis != CEED_BASIS_NONE) {
10574398b5aSJeremy L Thompson       bool    is_field_tensor;
10674398b5aSJeremy L Thompson       CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
10774398b5aSJeremy L Thompson 
10874398b5aSJeremy L Thompson       // Check if 3D
1094b3e95d5SJeremy L Thompson       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
11074398b5aSJeremy L Thompson       is_all_3d = is_all_3d && (field_dim == 3);
11174398b5aSJeremy L Thompson       *max_dim  = CeedIntMax(*max_dim, field_dim);
11274398b5aSJeremy L Thompson 
11374398b5aSJeremy L Thompson       // Collect P, P_1d, Q, and Q_1d
11474398b5aSJeremy L Thompson       CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
11574398b5aSJeremy L Thompson       *max_P = CeedIntMax(*max_P, field_P);
11674398b5aSJeremy L Thompson       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
11774398b5aSJeremy L Thompson       if (is_field_tensor) {
11874398b5aSJeremy L Thompson         CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
11974398b5aSJeremy L Thompson         *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
12074398b5aSJeremy L Thompson       }
12174398b5aSJeremy L Thompson       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
12274398b5aSJeremy L Thompson       CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
12374398b5aSJeremy L Thompson       *Q = field_Q;
12474398b5aSJeremy L Thompson       if (is_field_tensor) {
12574398b5aSJeremy L Thompson         CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
1264b3e95d5SJeremy L Thompson         CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
1274b3e95d5SJeremy L Thompson         *Q_1d = field_Q_1d;
1284b3e95d5SJeremy L Thompson       }
12974398b5aSJeremy L Thompson     }
13074398b5aSJeremy L Thompson     CeedCallBackend(CeedBasisDestroy(&basis));
13174398b5aSJeremy L Thompson   }
13274398b5aSJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
13374398b5aSJeremy L Thompson     CeedBasis basis;
13474398b5aSJeremy L Thompson 
13574398b5aSJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
13674398b5aSJeremy L Thompson     if (basis != CEED_BASIS_NONE) {
13774398b5aSJeremy L Thompson       bool    is_field_tensor;
13874398b5aSJeremy L Thompson       CeedInt field_dim = 0, field_P = 0, field_P_1d = 0, field_Q = 0, field_Q_1d = 0;
13974398b5aSJeremy L Thompson 
14074398b5aSJeremy L Thompson       // Check if 3D
14174398b5aSJeremy L Thompson       CeedCallBackend(CeedBasisGetDimension(basis, &field_dim));
14274398b5aSJeremy L Thompson       is_all_3d = is_all_3d && (field_dim == 3);
14374398b5aSJeremy L Thompson       *max_dim  = CeedIntMax(*max_dim, field_dim);
14474398b5aSJeremy L Thompson 
14574398b5aSJeremy L Thompson       // Collect P, P_1d, Q, and Q_1d
14674398b5aSJeremy L Thompson       CeedCallBackend(CeedBasisGetNumNodes(basis, &field_P));
14774398b5aSJeremy L Thompson       *max_P = CeedIntMax(*max_P, field_P);
14874398b5aSJeremy L Thompson       CeedCallBackend(CeedBasisIsTensor(basis, &is_field_tensor));
14974398b5aSJeremy L Thompson       if (is_field_tensor) {
15074398b5aSJeremy L Thompson         CeedCallBackend(CeedBasisGetNumNodes1D(basis, &field_P_1d));
15174398b5aSJeremy L Thompson         *max_P_1d = CeedIntMax(*max_P_1d, field_P_1d);
15274398b5aSJeremy L Thompson       }
15374398b5aSJeremy L Thompson       CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &field_Q));
15474398b5aSJeremy L Thompson       CeedCheck(*Q == 0 || field_Q == *Q, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
15574398b5aSJeremy L Thompson       *Q = field_Q;
15674398b5aSJeremy L Thompson       if (is_field_tensor) {
15774398b5aSJeremy L Thompson         CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &field_Q_1d));
15874398b5aSJeremy L Thompson         CeedCheck(*Q_1d == 0 || field_Q_1d == *Q_1d, ceed, CEED_ERROR_BACKEND, "Quadrature spaces must be compatible");
15974398b5aSJeremy L Thompson         *Q_1d = field_Q_1d;
16074398b5aSJeremy L Thompson       }
16174398b5aSJeremy L Thompson     }
1623a2968d6SJeremy L Thompson     CeedCallBackend(CeedBasisDestroy(&basis));
1634b3e95d5SJeremy L Thompson   }
1644b3e95d5SJeremy L Thompson 
1654b3e95d5SJeremy L Thompson   // Only use 3D collocated gradient parallelization strategy when gradient is computed
1664b3e95d5SJeremy L Thompson   *use_3d_slices = false;
16774398b5aSJeremy L Thompson   if (is_all_3d && *is_all_tensor) {
1684b3e95d5SJeremy L Thompson     bool was_grad_found = false;
1694b3e95d5SJeremy L Thompson 
1704b3e95d5SJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) {
1714b3e95d5SJeremy L Thompson       CeedEvalMode eval_mode;
1724b3e95d5SJeremy L Thompson 
1734b3e95d5SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1744b3e95d5SJeremy L Thompson       if (eval_mode == CEED_EVAL_GRAD) {
1754b3e95d5SJeremy L Thompson         CeedBasis_Hip_shared *basis_data;
1764b3e95d5SJeremy L Thompson         CeedBasis             basis;
1774b3e95d5SJeremy L Thompson 
1784b3e95d5SJeremy L Thompson         CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
1794b3e95d5SJeremy L Thompson         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
1804b3e95d5SJeremy L Thompson         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
1814b3e95d5SJeremy L Thompson         was_grad_found = true;
1823a2968d6SJeremy L Thompson         CeedCallBackend(CeedBasisDestroy(&basis));
1834b3e95d5SJeremy L Thompson       }
1844b3e95d5SJeremy L Thompson     }
1854b3e95d5SJeremy L Thompson     for (CeedInt i = 0; i < num_output_fields; i++) {
1864b3e95d5SJeremy L Thompson       CeedEvalMode eval_mode;
1874b3e95d5SJeremy L Thompson 
1884b3e95d5SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
1894b3e95d5SJeremy L Thompson       if (eval_mode == CEED_EVAL_GRAD) {
1904b3e95d5SJeremy L Thompson         CeedBasis_Hip_shared *basis_data;
1914b3e95d5SJeremy L Thompson         CeedBasis             basis;
1924b3e95d5SJeremy L Thompson 
1934b3e95d5SJeremy L Thompson         CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
1944b3e95d5SJeremy L Thompson         CeedCallBackend(CeedBasisGetData(basis, &basis_data));
1954b3e95d5SJeremy L Thompson         *use_3d_slices = basis_data->d_collo_grad_1d && (was_grad_found ? *use_3d_slices : true);
1964b3e95d5SJeremy L Thompson         was_grad_found = true;
1973a2968d6SJeremy L Thompson         CeedCallBackend(CeedBasisDestroy(&basis));
1984b3e95d5SJeremy L Thompson       }
1994b3e95d5SJeremy L Thompson     }
2004b3e95d5SJeremy L Thompson   }
2014b3e95d5SJeremy L Thompson   return CEED_ERROR_SUCCESS;
2024b3e95d5SJeremy L Thompson }
2034b3e95d5SJeremy L Thompson 
2044b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
2054b3e95d5SJeremy L Thompson // Setup fields
2064b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
207*0183ed61SJeremy L Thompson static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i,
208*0183ed61SJeremy L Thompson                                                     CeedOperatorField op_field, CeedQFunctionField qf_field, FieldReuse_Hip field_reuse,
209*0183ed61SJeremy L Thompson                                                     CeedInt max_dim, CeedInt Q, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
210*0183ed61SJeremy L Thompson                                                     bool use_3d_slices) {
21174398b5aSJeremy L Thompson   bool      is_tensor = true;
21274398b5aSJeremy L Thompson   CeedBasis basis;
21374398b5aSJeremy L Thompson   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
21474398b5aSJeremy L Thompson   if (basis != CEED_BASIS_NONE) CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
21574398b5aSJeremy L Thompson 
21659fa3f92SJeremy L Thompson   const char           *field_name;
2174b3e95d5SJeremy L Thompson   std::string           var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
2189123fb08SJeremy L Thompson   std::string           P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
2194b3e95d5SJeremy L Thompson   std::string           option_name = (is_input ? "inputs" : "outputs");
2204b3e95d5SJeremy L Thompson   CeedEvalMode          eval_mode   = CEED_EVAL_NONE;
22174398b5aSJeremy L Thompson   CeedInt               elem_size = 0, num_comp = 0, dim = max_dim, P_1d = 0;
2224b3e95d5SJeremy L Thompson   CeedElemRestriction   elem_rstr;
2234b3e95d5SJeremy L Thompson   CeedBasis_Hip_shared *basis_data;
2244b3e95d5SJeremy L Thompson 
2259ee499e5SJeremy L Thompson   // Field reuse info
22645a787f7SJeremy L Thompson   bool use_previous_field = field_reuse.index != -1;
2279ee499e5SJeremy L Thompson 
22859fa3f92SJeremy L Thompson   CeedCallBackend(CeedOperatorFieldGetName(op_field, &field_name));
229*0183ed61SJeremy L Thompson   code << tab << "// -- " << (is_input ? "Input" : "Output") << " field " << i << ": " << field_name << "\n";
2304b3e95d5SJeremy L Thompson 
2314b3e95d5SJeremy L Thompson   // Get field data
2324b3e95d5SJeremy L Thompson   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
2334b3e95d5SJeremy L Thompson   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
2344b3e95d5SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
2354b3e95d5SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
2364b3e95d5SJeremy L Thompson   }
2373a2968d6SJeremy L Thompson   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
2384b3e95d5SJeremy L Thompson   if (basis != CEED_BASIS_NONE) {
2394b3e95d5SJeremy L Thompson     CeedCallBackend(CeedBasisGetData(basis, &basis_data));
24074398b5aSJeremy L Thompson     CeedCallBackend(CeedBasisGetDimension(basis, &dim));
2419123fb08SJeremy L Thompson     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
2429123fb08SJeremy L Thompson     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
2434b3e95d5SJeremy L Thompson   }
2444b3e95d5SJeremy L Thompson   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
2454b3e95d5SJeremy L Thompson 
2464b3e95d5SJeremy L Thompson   // Set field constants
247*0183ed61SJeremy L Thompson   code << tab << "const CeedInt dim" << var_suffix << " = " << dim << ";\n";
24874398b5aSJeremy L Thompson   if (is_tensor && !is_all_tensor) {
24974398b5aSJeremy L Thompson     CeedInt P = 0;
25074398b5aSJeremy L Thompson 
25174398b5aSJeremy L Thompson     CeedCallBackend(CeedBasisGetNumNodes(basis, &P));
252*0183ed61SJeremy L Thompson     code << tab << "const CeedInt P" << var_suffix << " = " << (basis == CEED_BASIS_NONE ? Q : P) << ";\n";
25374398b5aSJeremy L Thompson   }
254*0183ed61SJeremy L Thompson   code << tab << "const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n";
255343e3094SJeremy L Thompson   if (eval_mode != CEED_EVAL_WEIGHT) {
256*0183ed61SJeremy L Thompson     code << tab << "const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n";
2574b3e95d5SJeremy L Thompson   }
2584b3e95d5SJeremy L Thompson 
2594b3e95d5SJeremy L Thompson   // Load basis data
260*0183ed61SJeremy L Thompson   code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
2614b3e95d5SJeremy L Thompson   switch (eval_mode) {
2624b3e95d5SJeremy L Thompson     case CEED_EVAL_NONE:
2634b3e95d5SJeremy L Thompson       break;
2644b3e95d5SJeremy L Thompson     case CEED_EVAL_INTERP:
2653a2968d6SJeremy L Thompson       if (is_at_points) {
2663a2968d6SJeremy L Thompson         // AtPoints
2673a2968d6SJeremy L Thompson         if (!basis_data->d_chebyshev_interp_1d) {
2683a2968d6SJeremy L Thompson           CeedSize    interp_bytes;
2693a2968d6SJeremy L Thompson           CeedScalar *chebyshev_interp_1d;
2703a2968d6SJeremy L Thompson 
2713a2968d6SJeremy L Thompson           interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
2723a2968d6SJeremy L Thompson           CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
2733a2968d6SJeremy L Thompson           CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
2743a2968d6SJeremy L Thompson           CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
2753a2968d6SJeremy L Thompson           CeedCallHip(CeedBasisReturnCeed(basis),
2763a2968d6SJeremy L Thompson                       hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
2773a2968d6SJeremy L Thompson           CeedCallBackend(CeedFree(&chebyshev_interp_1d));
2783a2968d6SJeremy L Thompson         }
2793a2968d6SJeremy L Thompson         if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
2803a2968d6SJeremy L Thompson         else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
2813a2968d6SJeremy L Thompson       } else {
2823a2968d6SJeremy L Thompson         // Standard quadrature
2834b3e95d5SJeremy L Thompson         if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
2844b3e95d5SJeremy L Thompson         else data->B.outputs[i] = basis_data->d_interp_1d;
2853a2968d6SJeremy L Thompson       }
2869ee499e5SJeremy L Thompson       if (use_previous_field) {
28745a787f7SJeremy L Thompson         std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
2889ee499e5SJeremy L Thompson 
289*0183ed61SJeremy L Thompson         code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
2909ee499e5SJeremy L Thompson       } else {
291*0183ed61SJeremy L Thompson         code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
292*0183ed61SJeremy L Thompson         code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
2939ee499e5SJeremy L Thompson       }
2944b3e95d5SJeremy L Thompson       break;
2954b3e95d5SJeremy L Thompson     case CEED_EVAL_GRAD:
2963a2968d6SJeremy L Thompson       if (is_at_points) {
2973a2968d6SJeremy L Thompson         // AtPoints
2983a2968d6SJeremy L Thompson         if (!basis_data->d_chebyshev_interp_1d) {
2993a2968d6SJeremy L Thompson           CeedSize    interp_bytes;
3003a2968d6SJeremy L Thompson           CeedScalar *chebyshev_interp_1d;
3013a2968d6SJeremy L Thompson 
3023a2968d6SJeremy L Thompson           interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
3033a2968d6SJeremy L Thompson           CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
3043a2968d6SJeremy L Thompson           CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
3053a2968d6SJeremy L Thompson           CeedCallHip(CeedBasisReturnCeed(basis), hipMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
3063a2968d6SJeremy L Thompson           CeedCallHip(CeedBasisReturnCeed(basis),
3073a2968d6SJeremy L Thompson                       hipMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, hipMemcpyHostToDevice));
3083a2968d6SJeremy L Thompson           CeedCallBackend(CeedFree(&chebyshev_interp_1d));
3093a2968d6SJeremy L Thompson         }
3103a2968d6SJeremy L Thompson         if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
3113a2968d6SJeremy L Thompson         else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
3123a2968d6SJeremy L Thompson       } else {
3133a2968d6SJeremy L Thompson         // Standard quadrature
3144b3e95d5SJeremy L Thompson         if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
3154b3e95d5SJeremy L Thompson         else data->B.outputs[i] = basis_data->d_interp_1d;
3163a2968d6SJeremy L Thompson       }
3179123fb08SJeremy L Thompson       if (is_tensor) {
3189ee499e5SJeremy L Thompson         if (use_previous_field) {
31945a787f7SJeremy L Thompson           std::string reuse_var = "s_B" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
3209ee499e5SJeremy L Thompson 
321*0183ed61SJeremy L Thompson           code << tab << "CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
3229ee499e5SJeremy L Thompson         } else {
323*0183ed61SJeremy L Thompson           code << tab << "__shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
324*0183ed61SJeremy L Thompson           code << tab << "LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
3259123fb08SJeremy L Thompson         }
3269ee499e5SJeremy L Thompson       }
3273a2968d6SJeremy L Thompson       if (is_at_points) break;  // No G mat for AtPoints
3284b3e95d5SJeremy L Thompson       if (use_3d_slices) {
3294b3e95d5SJeremy L Thompson         if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
3304b3e95d5SJeremy L Thompson         else data->G.outputs[i] = basis_data->d_collo_grad_1d;
33145a787f7SJeremy L Thompson         if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
33245a787f7SJeremy L Thompson           std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
3339ee499e5SJeremy L Thompson 
334*0183ed61SJeremy L Thompson           code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
3359ee499e5SJeremy L Thompson         } else {
336*0183ed61SJeremy L Thompson           code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
337*0183ed61SJeremy L Thompson           code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
3389ee499e5SJeremy L Thompson         }
3394b3e95d5SJeremy L Thompson       } else {
3404b3e95d5SJeremy L Thompson         bool has_collo_grad = basis_data->d_collo_grad_1d;
3414b3e95d5SJeremy L Thompson 
3424b3e95d5SJeremy L Thompson         if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
3434b3e95d5SJeremy L Thompson         else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
3444b3e95d5SJeremy L Thompson         if (has_collo_grad) {
34545a787f7SJeremy L Thompson           if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
34645a787f7SJeremy L Thompson             std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
3479ee499e5SJeremy L Thompson 
348*0183ed61SJeremy L Thompson             code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
3499ee499e5SJeremy L Thompson           } else {
350*0183ed61SJeremy L Thompson             code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
351*0183ed61SJeremy L Thompson             code << tab << "LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
3529ee499e5SJeremy L Thompson           }
3539ee499e5SJeremy L Thompson         } else {
35445a787f7SJeremy L Thompson           if (use_previous_field && field_reuse.eval_mode == CEED_EVAL_GRAD) {
35545a787f7SJeremy L Thompson             std::string reuse_var = "s_G" + ((field_reuse.is_input ? "_in_" : "_out_") + std::to_string(field_reuse.index));
3569ee499e5SJeremy L Thompson 
357*0183ed61SJeremy L Thompson             code << tab << "CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
3584b3e95d5SJeremy L Thompson           } else {
359*0183ed61SJeremy L Thompson             code << tab << "__shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim")
36074398b5aSJeremy L Thompson                  << (is_tensor ? "" : var_suffix) << "];\n";
361*0183ed61SJeremy L Thompson             code << tab << "LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << (is_tensor ? "" : var_suffix) << ">(data, G."
36274398b5aSJeremy L Thompson                  << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
3634b3e95d5SJeremy L Thompson           }
3644b3e95d5SJeremy L Thompson         }
3659ee499e5SJeremy L Thompson       }
3664b3e95d5SJeremy L Thompson       break;
3674b3e95d5SJeremy L Thompson     case CEED_EVAL_WEIGHT:
3684b3e95d5SJeremy L Thompson       break;  // No action
3694b3e95d5SJeremy L Thompson       // LCOV_EXCL_START
3704b3e95d5SJeremy L Thompson     case CEED_EVAL_DIV:
3714b3e95d5SJeremy L Thompson     case CEED_EVAL_CURL:
3724b3e95d5SJeremy L Thompson       break;  // TODO: Not implemented
3734b3e95d5SJeremy L Thompson               // LCOV_EXCL_STOP
3744b3e95d5SJeremy L Thompson   }
3753a2968d6SJeremy L Thompson   CeedCallBackend(CeedBasisDestroy(&basis));
3764b3e95d5SJeremy L Thompson   return CEED_ERROR_SUCCESS;
3774b3e95d5SJeremy L Thompson }
3784b3e95d5SJeremy L Thompson 
3794b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
3804b3e95d5SJeremy L Thompson // Restriction
3814b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
382*0183ed61SJeremy L Thompson static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i,
383*0183ed61SJeremy L Thompson                                                       CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
384*0183ed61SJeremy L Thompson                                                       CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor, bool is_at_points,
385*0183ed61SJeremy L Thompson                                                       bool use_3d_slices) {
3864b3e95d5SJeremy L Thompson   std::string              var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
38774398b5aSJeremy L Thompson   std::string              P_name     = (is_all_tensor ? "P_1d" : "P") + var_suffix;
3884b3e95d5SJeremy L Thompson   CeedEvalMode             eval_mode  = CEED_EVAL_NONE;
38974398b5aSJeremy L Thompson   CeedInt                  elem_size = 0, num_comp = 0;
3904b3e95d5SJeremy L Thompson   CeedSize                 l_size;
391f815fac9SJeremy L Thompson   CeedRestrictionType      rstr_type = CEED_RESTRICTION_STANDARD;
3924b3e95d5SJeremy L Thompson   CeedElemRestriction_Hip *rstr_data;
3934b3e95d5SJeremy L Thompson   CeedElemRestriction      elem_rstr;
3944b3e95d5SJeremy L Thompson 
3954b3e95d5SJeremy L Thompson   // Get field data
3964b3e95d5SJeremy L Thompson   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
3974b3e95d5SJeremy L Thompson   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
398f815fac9SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetType(elem_rstr, &rstr_type));
3994b3e95d5SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
4004b3e95d5SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
4014b3e95d5SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
4024b3e95d5SJeremy L Thompson   }
4034b3e95d5SJeremy L Thompson   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
4044b3e95d5SJeremy L Thompson 
4054b3e95d5SJeremy L Thompson   // Restriction
4064b3e95d5SJeremy L Thompson   if (is_input) {
4074b3e95d5SJeremy L Thompson     // Input
408e93651e5SJeremy L Thompson     if (field_input_buffer[i] != i) {
409e93651e5SJeremy L Thompson       std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);
410e93651e5SJeremy L Thompson 
411e93651e5SJeremy L Thompson       // Restriction was already done for previous input
412*0183ed61SJeremy L Thompson       code << tab << "CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
4133a2968d6SJeremy L Thompson     } else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices && is_at_points)) {
4143a2968d6SJeremy L Thompson       if (eval_mode == CEED_EVAL_NONE && rstr_type != CEED_RESTRICTION_POINTS) {
415e93651e5SJeremy L Thompson         // No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated
416*0183ed61SJeremy L Thompson         code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
4173a2968d6SJeremy L Thompson       } else if (rstr_type != CEED_RESTRICTION_POINTS) {
418e93651e5SJeremy L Thompson         // Otherwise we're using the scratch space
419*0183ed61SJeremy L Thompson         code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
420e93651e5SJeremy L Thompson       }
421f815fac9SJeremy L Thompson       switch (rstr_type) {
422f815fac9SJeremy L Thompson         case CEED_RESTRICTION_STANDARD: {
4234b3e95d5SJeremy L Thompson           CeedInt comp_stride;
4244b3e95d5SJeremy L Thompson 
4254b3e95d5SJeremy L Thompson           CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
426*0183ed61SJeremy L Thompson           code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
4274b3e95d5SJeremy L Thompson           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
428*0183ed61SJeremy L Thompson           code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
4294b3e95d5SJeremy L Thompson           data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
430*0183ed61SJeremy L Thompson           code << tab << "ReadLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
431*0183ed61SJeremy L Thompson                << P_name << ">(data, l_size" << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix
432*0183ed61SJeremy L Thompson                << ");\n";
433f815fac9SJeremy L Thompson           break;
434f815fac9SJeremy L Thompson         }
435f815fac9SJeremy L Thompson         case CEED_RESTRICTION_STRIDED: {
4364b3e95d5SJeremy L Thompson           bool    has_backend_strides;
4374b3e95d5SJeremy L Thompson           CeedInt num_elem;
4384b3e95d5SJeremy L Thompson 
4394b3e95d5SJeremy L Thompson           CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
4404b3e95d5SJeremy L Thompson           CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
4414b3e95d5SJeremy L Thompson           CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
4424b3e95d5SJeremy L Thompson 
4434b3e95d5SJeremy L Thompson           if (!has_backend_strides) {
4444b3e95d5SJeremy L Thompson             CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
4454b3e95d5SJeremy L Thompson           }
446*0183ed61SJeremy L Thompson           code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
447*0183ed61SJeremy L Thompson                << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
448*0183ed61SJeremy L Thompson           code << tab << "ReadLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
449*0183ed61SJeremy L Thompson                << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, d" << var_suffix << ", r_e"
450*0183ed61SJeremy L Thompson                << var_suffix << ");\n";
451f815fac9SJeremy L Thompson           break;
452f815fac9SJeremy L Thompson         }
4533a2968d6SJeremy L Thompson         case CEED_RESTRICTION_POINTS: {
4543a2968d6SJeremy L Thompson           CeedInt comp_stride;
4553a2968d6SJeremy L Thompson 
4563a2968d6SJeremy L Thompson           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
457*0183ed61SJeremy L Thompson           code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
4583a2968d6SJeremy L Thompson           data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
4593a2968d6SJeremy L Thompson           break;
4603a2968d6SJeremy L Thompson         }
461f815fac9SJeremy L Thompson         // LCOV_EXCL_START
462f815fac9SJeremy L Thompson         case CEED_RESTRICTION_ORIENTED:
463f815fac9SJeremy L Thompson         case CEED_RESTRICTION_CURL_ORIENTED:
464f815fac9SJeremy L Thompson           break;  // TODO: Not implemented
465f815fac9SJeremy L Thompson                   // LCOV_EXCL_STOP
4664b3e95d5SJeremy L Thompson       }
4674b3e95d5SJeremy L Thompson     }
4684b3e95d5SJeremy L Thompson   } else {
4694b3e95d5SJeremy L Thompson     // Output
470f815fac9SJeremy L Thompson     switch (rstr_type) {
471f815fac9SJeremy L Thompson       case CEED_RESTRICTION_STANDARD: {
4724b3e95d5SJeremy L Thompson         CeedInt comp_stride;
4734b3e95d5SJeremy L Thompson 
4744b3e95d5SJeremy L Thompson         CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
475*0183ed61SJeremy L Thompson         code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
4764b3e95d5SJeremy L Thompson         CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
477*0183ed61SJeremy L Thompson         code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
4784b3e95d5SJeremy L Thompson         data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
479*0183ed61SJeremy L Thompson         code << tab << "WriteLVecStandard" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", "
480*0183ed61SJeremy L Thompson              << P_name << ">(data, l_size" << var_suffix << ", elem, indices.outputs[" << i << "], r_e" << var_suffix << ", d" << var_suffix
481*0183ed61SJeremy L Thompson              << ");\n";
482f815fac9SJeremy L Thompson         break;
483f815fac9SJeremy L Thompson       }
484f815fac9SJeremy L Thompson       case CEED_RESTRICTION_STRIDED: {
4854b3e95d5SJeremy L Thompson         bool    has_backend_strides;
4864b3e95d5SJeremy L Thompson         CeedInt num_elem;
4874b3e95d5SJeremy L Thompson 
4884b3e95d5SJeremy L Thompson         CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
4894b3e95d5SJeremy L Thompson         CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
4904b3e95d5SJeremy L Thompson         CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
4914b3e95d5SJeremy L Thompson 
4924b3e95d5SJeremy L Thompson         if (!has_backend_strides) {
4934b3e95d5SJeremy L Thompson           CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
4944b3e95d5SJeremy L Thompson         }
495*0183ed61SJeremy L Thompson         code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
496*0183ed61SJeremy L Thompson              << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
497*0183ed61SJeremy L Thompson         code << tab << "WriteLVecStrided" << (is_all_tensor ? max_dim : 1) << "d<num_comp" << var_suffix << ", " << P_name << ", strides"
498*0183ed61SJeremy L Thompson              << var_suffix << "_0, strides" << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, r_e" << var_suffix << ", d" << var_suffix
499*0183ed61SJeremy L Thompson              << ");\n";
500f815fac9SJeremy L Thompson         break;
501f815fac9SJeremy L Thompson       }
5023a2968d6SJeremy L Thompson       case CEED_RESTRICTION_POINTS:
5033a2968d6SJeremy L Thompson         data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets;
5043a2968d6SJeremy L Thompson         break;
505f815fac9SJeremy L Thompson       // LCOV_EXCL_START
506f815fac9SJeremy L Thompson       case CEED_RESTRICTION_ORIENTED:
507f815fac9SJeremy L Thompson       case CEED_RESTRICTION_CURL_ORIENTED:
508f815fac9SJeremy L Thompson         break;  // TODO: Not implemented
509f815fac9SJeremy L Thompson                 // LCOV_EXCL_STOP
5104b3e95d5SJeremy L Thompson     }
5114b3e95d5SJeremy L Thompson   }
5123a2968d6SJeremy L Thompson   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
5134b3e95d5SJeremy L Thompson   return CEED_ERROR_SUCCESS;
5144b3e95d5SJeremy L Thompson }
5154b3e95d5SJeremy L Thompson 
5164b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
5174b3e95d5SJeremy L Thompson // Basis
5184b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
519*0183ed61SJeremy L Thompson static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt i, CeedOperatorField op_field,
52074398b5aSJeremy L Thompson                                                 CeedQFunctionField qf_field, CeedInt max_dim, CeedInt Q_1d, bool is_input, bool is_all_tensor,
5213a2968d6SJeremy L Thompson                                                 bool is_at_points, bool use_3d_slices) {
52274398b5aSJeremy L Thompson   bool      is_tensor = true;
52374398b5aSJeremy L Thompson   CeedBasis basis;
52474398b5aSJeremy L Thompson   CeedCallBackend(CeedOperatorFieldGetBasis(op_field, &basis));
52574398b5aSJeremy L Thompson   CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
52674398b5aSJeremy L Thompson 
5274b3e95d5SJeremy L Thompson   std::string         var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
5289123fb08SJeremy L Thompson   std::string         P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
5294b3e95d5SJeremy L Thompson   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
53074398b5aSJeremy L Thompson   CeedInt             dim = max_dim, elem_size = 0, num_comp = 0, P_1d = 0;
5314b3e95d5SJeremy L Thompson   CeedElemRestriction elem_rstr;
5324b3e95d5SJeremy L Thompson 
5334b3e95d5SJeremy L Thompson   // Get field data
5344b3e95d5SJeremy L Thompson   CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_field, &elem_rstr));
5354b3e95d5SJeremy L Thompson   if (elem_rstr != CEED_ELEMRESTRICTION_NONE) {
5364b3e95d5SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
5374b3e95d5SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
5384b3e95d5SJeremy L Thompson   }
5393a2968d6SJeremy L Thompson   CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
5404b3e95d5SJeremy L Thompson   if (basis != CEED_BASIS_NONE) {
54174398b5aSJeremy L Thompson     CeedCallBackend(CeedBasisGetDimension(basis, &dim));
5429123fb08SJeremy L Thompson     if (is_tensor) CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d));
5439123fb08SJeremy L Thompson     else CeedCallBackend(CeedBasisGetNumNodes(basis, &P_1d));
5444b3e95d5SJeremy L Thompson   }
5454b3e95d5SJeremy L Thompson   CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_field, &eval_mode));
5464b3e95d5SJeremy L Thompson 
5474b3e95d5SJeremy L Thompson   // Basis
548*0183ed61SJeremy L Thompson   code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
5494b3e95d5SJeremy L Thompson   if (is_input) {
5504b3e95d5SJeremy L Thompson     switch (eval_mode) {
5514b3e95d5SJeremy L Thompson       case CEED_EVAL_NONE:
5523a2968d6SJeremy L Thompson         if (!use_3d_slices && !is_at_points) {
553*0183ed61SJeremy L Thompson           code << tab << "CeedScalar *r_q" << var_suffix << " = r_e" << var_suffix << ";\n";
5544b3e95d5SJeremy L Thompson         }
5554b3e95d5SJeremy L Thompson         break;
5564b3e95d5SJeremy L Thompson       case CEED_EVAL_INTERP:
5573a2968d6SJeremy L Thompson         if (is_at_points) {
5589123fb08SJeremy L Thompson           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
5599123fb08SJeremy L Thompson 
560*0183ed61SJeremy L Thompson           code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
561*0183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
5626b92dc4bSJeremy L Thompson                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
5633a2968d6SJeremy L Thompson         } else {
56474398b5aSJeremy L Thompson           std::string function_name = is_tensor
56574398b5aSJeremy L Thompson                                           ? ((dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
56674398b5aSJeremy L Thompson                                           : "InterpNonTensor";
56774398b5aSJeremy L Thompson           std::string op_t_1d_name  = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
5689123fb08SJeremy L Thompson 
569*0183ed61SJeremy L Thompson           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
570*0183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
57174398b5aSJeremy L Thompson                << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
5723a2968d6SJeremy L Thompson         }
5734b3e95d5SJeremy L Thompson         break;
5744b3e95d5SJeremy L Thompson       case CEED_EVAL_GRAD:
5753a2968d6SJeremy L Thompson         if (is_at_points) {
5769123fb08SJeremy L Thompson           std::string function_name = (dim == 1 ? "Interp" : "InterpTensor") + std::to_string(dim) + "d";
5779123fb08SJeremy L Thompson 
578*0183ed61SJeremy L Thompson           code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (dim >= 3 ? Q_name : "1") << "];\n";
579*0183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
5806b92dc4bSJeremy L Thompson                << ", s_B" << var_suffix << ", r_c" << var_suffix << ");\n";
5813a2968d6SJeremy L Thompson         } else if (use_3d_slices) {
5829123fb08SJeremy L Thompson           std::string function_name = (dim > 1 ? "InterpTensor" : "Interp") + std::to_string(dim) + "d";
5839123fb08SJeremy L Thompson 
584*0183ed61SJeremy L Thompson           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
585*0183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_e" << var_suffix
5866b92dc4bSJeremy L Thompson                << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n";
5879123fb08SJeremy L Thompson         } else if (is_tensor) {
5889123fb08SJeremy L Thompson           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
58974398b5aSJeremy L Thompson           std::string function_name = (dim == 1 ? "Grad" : (is_collocated ? "GradTensorCollocated" : "GradTensor")) + std::to_string(dim) + "d" +
59074398b5aSJeremy L Thompson                                       (is_all_tensor ? "" : "Flattened");
59174398b5aSJeremy L Thompson           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
5929123fb08SJeremy L Thompson 
593*0183ed61SJeremy L Thompson           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
59474398b5aSJeremy L Thompson                << (is_all_tensor && dim >= 3 ? Q_name : "1") << "];\n";
595*0183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_e"
59674398b5aSJeremy L Thompson                << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
5974b3e95d5SJeremy L Thompson         } else {
5989123fb08SJeremy L Thompson           std::string function_name = "GradNonTensor";
5999123fb08SJeremy L Thompson 
600*0183ed61SJeremy L Thompson           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
601*0183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
60274398b5aSJeremy L Thompson                << ", OP_T_1D>(data, r_e" << var_suffix << ", s_G" << var_suffix << ", r_q" << var_suffix << ");\n";
6034b3e95d5SJeremy L Thompson         }
6044b3e95d5SJeremy L Thompson         break;
6054b3e95d5SJeremy L Thompson       case CEED_EVAL_WEIGHT: {
6063a2968d6SJeremy L Thompson         if (is_at_points) {
607*0183ed61SJeremy L Thompson           code << tab << "// Nothing to do AtPoints\n";
6083a2968d6SJeremy L Thompson         } else {
6094b3e95d5SJeremy L Thompson           CeedBasis_Hip_shared *basis_data;
61074398b5aSJeremy L Thompson           std::string           function_name = is_tensor
61174398b5aSJeremy L Thompson                                                     ? ((dim == 1 ? "Weight" : "WeightTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
61274398b5aSJeremy L Thompson                                                     : "WeightNonTensor";
6134b3e95d5SJeremy L Thompson 
614*0183ed61SJeremy L Thompson           code << tab << "CeedScalar r_q" << var_suffix << "[" << (is_all_tensor && (dim >= 3) ? Q_name : "1") << "];\n";
6154b3e95d5SJeremy L Thompson           CeedCallBackend(CeedBasisGetData(basis, &basis_data));
6164b3e95d5SJeremy L Thompson           data->W = basis_data->d_q_weight_1d;
617*0183ed61SJeremy L Thompson           code << tab << function_name << "<" << P_name << ", " << Q_name << ">(data, W, r_q" << var_suffix << ");\n";
6183a2968d6SJeremy L Thompson         }
6194b3e95d5SJeremy L Thompson         break;
6204b3e95d5SJeremy L Thompson       }
6214b3e95d5SJeremy L Thompson       // LCOV_EXCL_START
6224b3e95d5SJeremy L Thompson       case CEED_EVAL_DIV:
6234b3e95d5SJeremy L Thompson       case CEED_EVAL_CURL:
6244b3e95d5SJeremy L Thompson         break;  // TODO: Not implemented
6254b3e95d5SJeremy L Thompson                 // LCOV_EXCL_STOP
6264b3e95d5SJeremy L Thompson     }
6274b3e95d5SJeremy L Thompson   } else {
6284b3e95d5SJeremy L Thompson     switch (eval_mode) {
6294b3e95d5SJeremy L Thompson       case CEED_EVAL_NONE:
630*0183ed61SJeremy L Thompson         code << tab << "CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
6314b3e95d5SJeremy L Thompson         break;  // No action
6324b3e95d5SJeremy L Thompson       case CEED_EVAL_INTERP:
633*0183ed61SJeremy L Thompson         code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
6343a2968d6SJeremy L Thompson         if (is_at_points) {
6359123fb08SJeremy L Thompson           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
6369123fb08SJeremy L Thompson 
637*0183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
6386b92dc4bSJeremy L Thompson                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
6393a2968d6SJeremy L Thompson         } else {
6409123fb08SJeremy L Thompson           std::string function_name =
64174398b5aSJeremy L Thompson               is_tensor ? ((dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened"))
64274398b5aSJeremy L Thompson                         : "InterpTransposeNonTensor";
64374398b5aSJeremy L Thompson           std::string op_t_1d_name = (is_all_tensor || !is_tensor) ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
6449123fb08SJeremy L Thompson 
645*0183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
64674398b5aSJeremy L Thompson                << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
6473a2968d6SJeremy L Thompson         }
6484b3e95d5SJeremy L Thompson         break;
6494b3e95d5SJeremy L Thompson       case CEED_EVAL_GRAD:
650*0183ed61SJeremy L Thompson         code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
6513a2968d6SJeremy L Thompson         if (is_at_points) {
6529123fb08SJeremy L Thompson           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
6539123fb08SJeremy L Thompson 
654*0183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_c" << var_suffix
6556b92dc4bSJeremy L Thompson                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
6563a2968d6SJeremy L Thompson         } else if (use_3d_slices) {
6579123fb08SJeremy L Thompson           std::string function_name = (dim == 1 ? "InterpTranspose" : "InterpTransposeTensor") + std::to_string(dim) + "d";
6589123fb08SJeremy L Thompson 
659*0183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", OP_T_1D>(data, r_q" << var_suffix
6606b92dc4bSJeremy L Thompson                << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
6619123fb08SJeremy L Thompson         } else if (is_tensor) {
6629123fb08SJeremy L Thompson           bool        is_collocated = dim == 3 && Q_1d >= P_1d;
66374398b5aSJeremy L Thompson           std::string function_name = (dim == 1 ? "GradTranspose" : (is_collocated ? "GradTransposeTensorCollocated" : "GradTransposeTensor")) +
66474398b5aSJeremy L Thompson                                       std::to_string(dim) + "d" + (is_all_tensor ? "" : "Flattened");
66574398b5aSJeremy L Thompson           std::string op_t_1d_name = is_all_tensor ? "OP_T_1D" : (P_1d > Q_1d ? P_name : Q_name);
6669123fb08SJeremy L Thompson 
667*0183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", " << P_name << ", " << Q_name << ", " << op_t_1d_name << ">(data, r_q"
66874398b5aSJeremy L Thompson                << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
6694b3e95d5SJeremy L Thompson         } else {
6709123fb08SJeremy L Thompson           std::string function_name = "GradTransposeNonTensor";
6719123fb08SJeremy L Thompson 
672*0183ed61SJeremy L Thompson           code << tab << function_name << "<num_comp" << var_suffix << ", dim" << var_suffix << ", " << P_name << ", " << Q_name
67374398b5aSJeremy L Thompson                << ", OP_T_1D>(data, r_q" << var_suffix << ", s_G" << var_suffix << ", r_e" << var_suffix << ");\n";
6744b3e95d5SJeremy L Thompson         }
6754b3e95d5SJeremy L Thompson         break;
6764b3e95d5SJeremy L Thompson       // LCOV_EXCL_START
6774b3e95d5SJeremy L Thompson       case CEED_EVAL_WEIGHT:
6784b3e95d5SJeremy L Thompson         break;  // Should not occur
6794b3e95d5SJeremy L Thompson       case CEED_EVAL_DIV:
6804b3e95d5SJeremy L Thompson       case CEED_EVAL_CURL:
6814b3e95d5SJeremy L Thompson         break;  // TODO: Not implemented
6824b3e95d5SJeremy L Thompson                 // LCOV_EXCL_STOP
6834b3e95d5SJeremy L Thompson     }
6844b3e95d5SJeremy L Thompson   }
6853a2968d6SJeremy L Thompson   CeedCallBackend(CeedBasisDestroy(&basis));
6864b3e95d5SJeremy L Thompson   return CEED_ERROR_SUCCESS;
6874b3e95d5SJeremy L Thompson }
6884b3e95d5SJeremy L Thompson 
6894b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
6904b3e95d5SJeremy L Thompson // QFunction
6914b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
692*0183ed61SJeremy L Thompson static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, Tab &tab, CeedInt max_dim,
693*0183ed61SJeremy L Thompson                                                     CeedInt max_num_points, CeedInt num_input_fields, CeedOperatorField *op_input_fields,
694*0183ed61SJeremy L Thompson                                                     CeedQFunctionField *qf_input_fields, CeedInt num_output_fields,
695*0183ed61SJeremy L Thompson                                                     CeedOperatorField *op_output_fields, CeedQFunctionField *qf_output_fields,
696*0183ed61SJeremy L Thompson                                                     std::string qfunction_name, CeedInt Q_1d, bool is_all_tensor, bool is_at_points,
697*0183ed61SJeremy L Thompson                                                     bool use_3d_slices) {
69874398b5aSJeremy L Thompson   std::string         Q_name    = is_all_tensor ? "Q_1d" : "Q";
6994b3e95d5SJeremy L Thompson   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
7004b3e95d5SJeremy L Thompson   CeedElemRestriction elem_rstr;
7014b3e95d5SJeremy L Thompson 
7028b97b69aSJeremy L Thompson   // Setup output arrays
703*0183ed61SJeremy L Thompson   code << "\n";
704*0183ed61SJeremy L Thompson   code << tab << "// -- Output field setup\n";
7054b3e95d5SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
70659fa3f92SJeremy L Thompson     const char *field_name;
7074b3e95d5SJeremy L Thompson     std::string var_suffix = "_out_" + std::to_string(i);
7084b3e95d5SJeremy L Thompson 
70959fa3f92SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
710*0183ed61SJeremy L Thompson     code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
7114b3e95d5SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
7123a2968d6SJeremy L Thompson     switch (eval_mode) {
7133a2968d6SJeremy L Thompson       case CEED_EVAL_NONE:
7143a2968d6SJeremy L Thompson         if (is_at_points) {
715*0183ed61SJeremy L Thompson           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "];\n";
7163a2968d6SJeremy L Thompson         } else {
717*0183ed61SJeremy L Thompson           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
71874398b5aSJeremy L Thompson                << "];\n";
7194b3e95d5SJeremy L Thompson         }
7203a2968d6SJeremy L Thompson         break;
7213a2968d6SJeremy L Thompson       case CEED_EVAL_INTERP:
7223a2968d6SJeremy L Thompson         if (is_at_points) {
7233a2968d6SJeremy L Thompson           // Accumulator for point data
724*0183ed61SJeremy L Thompson           code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
725*0183ed61SJeremy L Thompson           code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
726b8245c6cSJeremy L Thompson                << "[i] = 0.0;\n";
7273a2968d6SJeremy L Thompson         } else {
728*0183ed61SJeremy L Thompson           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << (is_all_tensor && (max_dim >= 3) ? Q_name : "1")
72974398b5aSJeremy L Thompson                << "];\n";
7303a2968d6SJeremy L Thompson         }
7313a2968d6SJeremy L Thompson         break;
7323a2968d6SJeremy L Thompson       case CEED_EVAL_GRAD:
7333a2968d6SJeremy L Thompson         if (is_at_points) {
7343a2968d6SJeremy L Thompson           // Accumulator for point data
735*0183ed61SJeremy L Thompson           code << tab << "CeedScalar r_c" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "];\n";
736*0183ed61SJeremy L Thompson           code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << (max_dim >= 3 ? Q_name : "1") << "; i++) r_c" << var_suffix
737b8245c6cSJeremy L Thompson                << "[i] = 0.0;\n";
7383a2968d6SJeremy L Thompson         } else if (use_3d_slices) {
7394b3e95d5SJeremy L Thompson           // Accumulator for gradient slices
740*0183ed61SJeremy L Thompson           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n";
741*0183ed61SJeremy L Thompson           code << tab << "for (CeedInt i = 0; i < num_comp" << var_suffix << "*" << Q_name << "; i++) r_q" << var_suffix << "[i] = 0.0;\n";
7424b3e95d5SJeremy L Thompson         } else {
743*0183ed61SJeremy L Thompson           code << tab << "CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "*"
74474398b5aSJeremy L Thompson                << (is_all_tensor && (max_dim >= 3) ? Q_name : "1") << "];\n";
7454b3e95d5SJeremy L Thompson         }
7463a2968d6SJeremy L Thompson         break;
7473a2968d6SJeremy L Thompson       case CEED_EVAL_WEIGHT:
7483a2968d6SJeremy L Thompson         break;
7493a2968d6SJeremy L Thompson         // LCOV_EXCL_START
7503a2968d6SJeremy L Thompson       case CEED_EVAL_DIV:
7513a2968d6SJeremy L Thompson       case CEED_EVAL_CURL:
7523a2968d6SJeremy L Thompson         break;  // TODO: Not implemented
7533a2968d6SJeremy L Thompson                 // LCOV_EXCL_STOP
7544b3e95d5SJeremy L Thompson     }
7554b3e95d5SJeremy L Thompson   }
7564b3e95d5SJeremy L Thompson 
7573a2968d6SJeremy L Thompson   if (is_at_points) {
7583a2968d6SJeremy L Thompson     // We need to handle batches of points
759*0183ed61SJeremy L Thompson     code << "\n";
760*0183ed61SJeremy L Thompson     code << tab << "// Note: Using batches of points\n";
761*0183ed61SJeremy L Thompson     code << tab << "const CeedInt point_loop_bound = (blockDim.x*blockDim.y) * ceil((1.0*max_num_points) / (blockDim.x*blockDim.y));\n\n";
762*0183ed61SJeremy L Thompson     code << tab << "#pragma unroll\n";
763*0183ed61SJeremy L Thompson     code << tab << "for (CeedInt i = threadIdx.x + threadIdx.y*blockDim.x; i < point_loop_bound; i += blockDim.x*blockDim.y) {\n";
764*0183ed61SJeremy L Thompson     tab.push();
765*0183ed61SJeremy L Thompson     code << tab << "const CeedInt p = i % max_num_points;\n\n";
7663a2968d6SJeremy L Thompson 
767*0183ed61SJeremy L Thompson     code << tab << "// -- Coordinates\n";
768*0183ed61SJeremy L Thompson     code << tab << "CeedScalar r_x[max_dim];\n";
769*0183ed61SJeremy L Thompson     code << tab << "ReadPoint<max_dim, coords_comp_stride, max_num_points>(data, elem, p, max_num_points, points.indices, points.coords, r_x);\n\n";
7703a2968d6SJeremy L Thompson 
771*0183ed61SJeremy L Thompson     code << tab << "// -- Input fields\n";
7723a2968d6SJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) {
77359fa3f92SJeremy L Thompson       const char *field_name;
7743a2968d6SJeremy L Thompson       std::string var_suffix = "_in_" + std::to_string(i);
775f725b54bSJeremy L Thompson       std::string P_name     = "P_1d" + var_suffix;
7763a2968d6SJeremy L Thompson 
77759fa3f92SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
778*0183ed61SJeremy L Thompson       code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
7793a2968d6SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
7803a2968d6SJeremy L Thompson       // Basis action
781*0183ed61SJeremy L Thompson       code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
7823a2968d6SJeremy L Thompson       switch (eval_mode) {
7833a2968d6SJeremy L Thompson         case CEED_EVAL_NONE:
784*0183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
785*0183ed61SJeremy L Thompson           code << tab << "ReadPoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
7863a2968d6SJeremy L Thompson                << ", max_num_points>(data, elem, p, max_num_points, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
7873a2968d6SJeremy L Thompson           break;
7883a2968d6SJeremy L Thompson         case CEED_EVAL_INTERP:
789*0183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
790*0183ed61SJeremy L Thompson           code << tab << "InterpAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
79174398b5aSJeremy L Thompson                << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
7923a2968d6SJeremy L Thompson           break;
7933a2968d6SJeremy L Thompson         case CEED_EVAL_GRAD:
794*0183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
795*0183ed61SJeremy L Thompson           code << tab << "GradAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
79674398b5aSJeremy L Thompson                << ">(data, i, r_c" << var_suffix << ", r_x, r_s" << var_suffix << ");\n";
7973a2968d6SJeremy L Thompson           break;
7983a2968d6SJeremy L Thompson         case CEED_EVAL_WEIGHT:
799*0183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[1];\n";
800*0183ed61SJeremy L Thompson           code << tab << "r_s" << var_suffix << "[0] = 1.0;\n";
8013a2968d6SJeremy L Thompson           break;
8023a2968d6SJeremy L Thompson           // LCOV_EXCL_START
8033a2968d6SJeremy L Thompson         case CEED_EVAL_DIV:
8043a2968d6SJeremy L Thompson         case CEED_EVAL_CURL:
8053a2968d6SJeremy L Thompson           break;  // TODO: Not implemented
8063a2968d6SJeremy L Thompson                   // LCOV_EXCL_STOP
8073a2968d6SJeremy L Thompson       }
8083a2968d6SJeremy L Thompson     }
809*0183ed61SJeremy L Thompson     code << "\n";
810*0183ed61SJeremy L Thompson     code << tab << "// -- Output fields\n";
8113a2968d6SJeremy L Thompson     for (CeedInt i = 0; i < num_output_fields; i++) {
81259fa3f92SJeremy L Thompson       const char *field_name;
8133a2968d6SJeremy L Thompson       std::string var_suffix = "_out_" + std::to_string(i);
8143a2968d6SJeremy L Thompson 
81559fa3f92SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
816*0183ed61SJeremy L Thompson       code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
8173a2968d6SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
8183a2968d6SJeremy L Thompson       // Basis action
8193a2968d6SJeremy L Thompson       switch (eval_mode) {
8203a2968d6SJeremy L Thompson         case CEED_EVAL_NONE:
821*0183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
8223a2968d6SJeremy L Thompson           break;
8233a2968d6SJeremy L Thompson         case CEED_EVAL_INTERP:
824*0183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
8253a2968d6SJeremy L Thompson           break;
8263a2968d6SJeremy L Thompson         case CEED_EVAL_GRAD:
827*0183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
8283a2968d6SJeremy L Thompson           break;
8293a2968d6SJeremy L Thompson           // LCOV_EXCL_START
8303a2968d6SJeremy L Thompson         case CEED_EVAL_WEIGHT:
8313a2968d6SJeremy L Thompson           break;  // Should not occur
8323a2968d6SJeremy L Thompson         case CEED_EVAL_DIV:
8333a2968d6SJeremy L Thompson         case CEED_EVAL_CURL:
8343a2968d6SJeremy L Thompson           break;  // TODO: Not implemented
8353a2968d6SJeremy L Thompson                   // LCOV_EXCL_STOP
8363a2968d6SJeremy L Thompson       }
8373a2968d6SJeremy L Thompson     }
8383a2968d6SJeremy L Thompson 
8393a2968d6SJeremy L Thompson   } else if (use_3d_slices) {
8404b3e95d5SJeremy L Thompson     // We treat quadrature points per slice in 3d to save registers
841*0183ed61SJeremy L Thompson     code << "\n";
842*0183ed61SJeremy L Thompson     code << tab << "// Note: Using planes of 3D elements\n";
843*0183ed61SJeremy L Thompson     code << tab << "#pragma unroll\n";
844*0183ed61SJeremy L Thompson     code << tab << "for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
845*0183ed61SJeremy L Thompson     tab.push();
846*0183ed61SJeremy L Thompson     code << tab << "// -- Input fields\n";
8474b3e95d5SJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) {
84859fa3f92SJeremy L Thompson       const char *field_name;
8494b3e95d5SJeremy L Thompson       std::string var_suffix = "_in_" + std::to_string(i);
8504b3e95d5SJeremy L Thompson 
85159fa3f92SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
852*0183ed61SJeremy L Thompson       code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
8534b3e95d5SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
8544b3e95d5SJeremy L Thompson       // Basis action
855*0183ed61SJeremy L Thompson       code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
8564b3e95d5SJeremy L Thompson       switch (eval_mode) {
8574b3e95d5SJeremy L Thompson         case CEED_EVAL_NONE:
8584b3e95d5SJeremy L Thompson           bool is_strided;
8594b3e95d5SJeremy L Thompson 
860*0183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
8614b3e95d5SJeremy L Thompson 
8624b3e95d5SJeremy L Thompson           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
8634b3e95d5SJeremy L Thompson           CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
8644b3e95d5SJeremy L Thompson           if (is_strided) {
8654b3e95d5SJeremy L Thompson             bool    has_backend_strides;
8664b3e95d5SJeremy L Thompson             CeedInt num_elem, elem_size;
8674b3e95d5SJeremy L Thompson 
8684b3e95d5SJeremy L Thompson             CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
8694b3e95d5SJeremy L Thompson             CeedCallBackend(CeedElemRestrictionHasBackendStrides(elem_rstr, &has_backend_strides));
8704b3e95d5SJeremy L Thompson             CeedCallBackend(CeedElemRestrictionGetNumElements(elem_rstr, &num_elem));
8714b3e95d5SJeremy L Thompson             CeedInt strides[3] = {1, elem_size * num_elem, elem_size};
8724b3e95d5SJeremy L Thompson 
8734b3e95d5SJeremy L Thompson             if (!has_backend_strides) {
8744b3e95d5SJeremy L Thompson               CeedCallBackend(CeedElemRestrictionGetStrides(elem_rstr, strides));
8754b3e95d5SJeremy L Thompson             }
876*0183ed61SJeremy L Thompson             code << tab << "const CeedInt strides" << var_suffix << "_0 = " << strides[0] << ", strides" << var_suffix << "_1 = " << strides[1]
877*0183ed61SJeremy L Thompson                  << ", strides" << var_suffix << "_2 = " << strides[2] << ";\n";
878*0183ed61SJeremy L Thompson             code << tab << "ReadEVecSliceStrided3d<num_comp" << var_suffix << ", " << Q_name << ", strides" << var_suffix << "_0, strides"
879*0183ed61SJeremy L Thompson                  << var_suffix << "_1, strides" << var_suffix << "_2>(data, elem, q, d" << var_suffix << ", r_s" << var_suffix << ");\n";
8804b3e95d5SJeremy L Thompson           } else {
8814b3e95d5SJeremy L Thompson             CeedSize                 l_size = 0;
8824b3e95d5SJeremy L Thompson             CeedInt                  comp_stride;
8834b3e95d5SJeremy L Thompson             CeedElemRestriction_Hip *rstr_data;
8844b3e95d5SJeremy L Thompson 
8854b3e95d5SJeremy L Thompson             CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
886*0183ed61SJeremy L Thompson             code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
8874b3e95d5SJeremy L Thompson             CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
888*0183ed61SJeremy L Thompson             code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
8894b3e95d5SJeremy L Thompson             CeedCallBackend(CeedElemRestrictionGetData(elem_rstr, &rstr_data));
8904b3e95d5SJeremy L Thompson             data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets;
891*0183ed61SJeremy L Thompson             code << tab << "ReadEVecSliceStandard3d<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", " << Q_name << ">(data, l_size"
892*0183ed61SJeremy L Thompson                  << var_suffix << ", elem, q, indices.inputs[" << i << "], d" << var_suffix << ", r_s" << var_suffix << ");\n";
8934b3e95d5SJeremy L Thompson           }
8949123fb08SJeremy L Thompson           CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
8954b3e95d5SJeremy L Thompson           break;
8964b3e95d5SJeremy L Thompson         case CEED_EVAL_INTERP:
897*0183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
898*0183ed61SJeremy L Thompson           code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) {\n";
899*0183ed61SJeremy L Thompson           tab.push();
900*0183ed61SJeremy L Thompson           code << tab << "r_s" << var_suffix << "[j] = r_q" << var_suffix << "[q + j*" << Q_name << "];\n";
901*0183ed61SJeremy L Thompson           tab.pop();
902*0183ed61SJeremy L Thompson           code << tab << "}\n";
9034b3e95d5SJeremy L Thompson           break;
9044b3e95d5SJeremy L Thompson         case CEED_EVAL_GRAD:
905*0183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
906*0183ed61SJeremy L Thompson           code << tab << "GradColloSlice3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_q" << var_suffix << ", s_G"
9076b92dc4bSJeremy L Thompson                << var_suffix << ", r_s" << var_suffix << ");\n";
9084b3e95d5SJeremy L Thompson           break;
9094b3e95d5SJeremy L Thompson         case CEED_EVAL_WEIGHT:
910*0183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[1];\n";
911*0183ed61SJeremy L Thompson           code << tab << "r_s" << var_suffix << "[0] = r_q" << var_suffix << "[q];\n";
9123a2968d6SJeremy L Thompson           break;
9134b3e95d5SJeremy L Thompson           // LCOV_EXCL_START
9144b3e95d5SJeremy L Thompson         case CEED_EVAL_DIV:
9154b3e95d5SJeremy L Thompson         case CEED_EVAL_CURL:
9164b3e95d5SJeremy L Thompson           break;  // TODO: Not implemented
9174b3e95d5SJeremy L Thompson                   // LCOV_EXCL_STOP
9184b3e95d5SJeremy L Thompson       }
9194b3e95d5SJeremy L Thompson     }
920*0183ed61SJeremy L Thompson     code << "\n";
921*0183ed61SJeremy L Thompson     code << tab << "// -- Output fields\n";
9224b3e95d5SJeremy L Thompson     for (CeedInt i = 0; i < num_output_fields; i++) {
92359fa3f92SJeremy L Thompson       const char *field_name;
9244b3e95d5SJeremy L Thompson       std::string var_suffix = "_out_" + std::to_string(i);
9254b3e95d5SJeremy L Thompson 
92659fa3f92SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
927*0183ed61SJeremy L Thompson       code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
9284b3e95d5SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
9294b3e95d5SJeremy L Thompson       // Basis action
9304b3e95d5SJeremy L Thompson       switch (eval_mode) {
9314b3e95d5SJeremy L Thompson         case CEED_EVAL_NONE:
932*0183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
9333a2968d6SJeremy L Thompson           break;
9344b3e95d5SJeremy L Thompson         case CEED_EVAL_INTERP:
935*0183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "];\n";
9364b3e95d5SJeremy L Thompson           break;
9374b3e95d5SJeremy L Thompson         case CEED_EVAL_GRAD:
938*0183ed61SJeremy L Thompson           code << tab << "CeedScalar r_s" << var_suffix << "[num_comp" << var_suffix << "*dim" << var_suffix << "];\n";
9394b3e95d5SJeremy L Thompson           break;
9404b3e95d5SJeremy L Thompson           // LCOV_EXCL_START
9414b3e95d5SJeremy L Thompson         case CEED_EVAL_WEIGHT:
9424b3e95d5SJeremy L Thompson           break;  // Should not occur
9434b3e95d5SJeremy L Thompson         case CEED_EVAL_DIV:
9444b3e95d5SJeremy L Thompson         case CEED_EVAL_CURL:
9454b3e95d5SJeremy L Thompson           break;  // TODO: Not implemented
9464b3e95d5SJeremy L Thompson                   // LCOV_EXCL_STOP
9474b3e95d5SJeremy L Thompson       }
9484b3e95d5SJeremy L Thompson     }
9494b3e95d5SJeremy L Thompson   } else {
950*0183ed61SJeremy L Thompson     code << "\n";
951*0183ed61SJeremy L Thompson     code << tab << "// Note: Using full elements\n";
952*0183ed61SJeremy L Thompson     code << tab << "{\n";
953*0183ed61SJeremy L Thompson     tab.push();
954*0183ed61SJeremy L Thompson     code << tab << "// -- Input fields\n";
9554b3e95d5SJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) {
95659fa3f92SJeremy L Thompson       const char *field_name;
95759fa3f92SJeremy L Thompson 
95859fa3f92SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
959*0183ed61SJeremy L Thompson       code << tab << "// ---- Input field " << i << ": " << field_name << "\n";
960*0183ed61SJeremy L Thompson       code << tab << "CeedScalar *r_s_in_" << i << " = r_q_in_" << i << ";\n";
9614b3e95d5SJeremy L Thompson     }
962*0183ed61SJeremy L Thompson     code << tab << "// -- Output fields\n";
9634b3e95d5SJeremy L Thompson     for (CeedInt i = 0; i < num_output_fields; i++) {
96459fa3f92SJeremy L Thompson       const char *field_name;
96559fa3f92SJeremy L Thompson 
96659fa3f92SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
967*0183ed61SJeremy L Thompson       code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
968*0183ed61SJeremy L Thompson       code << tab << "CeedScalar *r_s_out_" << i << " = r_q_out_" << i << ";\n";
9694b3e95d5SJeremy L Thompson     }
9704b3e95d5SJeremy L Thompson   }
9714b3e95d5SJeremy L Thompson 
9724b3e95d5SJeremy L Thompson   // Input and output buffers
973*0183ed61SJeremy L Thompson   code << "\n";
974*0183ed61SJeremy L Thompson   code << tab << "// -- QFunction inputs and outputs\n";
975*0183ed61SJeremy L Thompson   code << tab << "// ---- Inputs\n";
976*0183ed61SJeremy L Thompson   code << tab << "CeedScalar *inputs[" << CeedIntMax(num_input_fields, 1) << "];\n";
9774b3e95d5SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
97859fa3f92SJeremy L Thompson     const char *field_name;
97959fa3f92SJeremy L Thompson 
98059fa3f92SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[i], &field_name));
981*0183ed61SJeremy L Thompson     code << tab << "// ------ Input field " << i << ": " << field_name << "\n";
982*0183ed61SJeremy L Thompson     code << tab << "inputs[" << i << "] = r_s_in_" << i << ";\n";
9834b3e95d5SJeremy L Thompson   }
984*0183ed61SJeremy L Thompson   code << tab << "// ---- Outputs\n";
985*0183ed61SJeremy L Thompson   code << tab << "CeedScalar *outputs[" << CeedIntMax(num_output_fields, 1) << "];\n";
9864b3e95d5SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
98759fa3f92SJeremy L Thompson     const char *field_name;
98859fa3f92SJeremy L Thompson 
98959fa3f92SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
990*0183ed61SJeremy L Thompson     code << tab << "// ------ Output field " << i << ": " << field_name << "\n";
991*0183ed61SJeremy L Thompson     code << tab << "outputs[" << i << "] = r_s_out_" << i << ";\n";
9924b3e95d5SJeremy L Thompson   }
9934b3e95d5SJeremy L Thompson 
9944b3e95d5SJeremy L Thompson   // Apply QFunction
995*0183ed61SJeremy L Thompson   code << "\n";
996*0183ed61SJeremy L Thompson   code << tab << "// -- Apply QFunction\n";
997*0183ed61SJeremy L Thompson   code << tab << "" << qfunction_name << "(ctx, ";
99874398b5aSJeremy L Thompson   if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
9994b3e95d5SJeremy L Thompson     code << "1";
10004b3e95d5SJeremy L Thompson   } else {
10019123fb08SJeremy L Thompson     code << Q_name;
10024b3e95d5SJeremy L Thompson   }
10034b3e95d5SJeremy L Thompson   code << ", inputs, outputs);\n";
10044b3e95d5SJeremy L Thompson 
10053a2968d6SJeremy L Thompson   if (is_at_points) {
10063a2968d6SJeremy L Thompson     // Map back to coefficients
1007*0183ed61SJeremy L Thompson     code << "\n";
1008*0183ed61SJeremy L Thompson     code << tab << "// -- Output fields\n";
10093a2968d6SJeremy L Thompson     for (CeedInt i = 0; i < num_output_fields; i++) {
101059fa3f92SJeremy L Thompson       const char *field_name;
10113a2968d6SJeremy L Thompson       std::string var_suffix = "_out_" + std::to_string(i);
10123a2968d6SJeremy L Thompson       std::string P_name     = "P_1d" + var_suffix;
10133a2968d6SJeremy L Thompson 
101459fa3f92SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
1015*0183ed61SJeremy L Thompson       code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
10163a2968d6SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
10173a2968d6SJeremy L Thompson       // Basis action
1018*0183ed61SJeremy L Thompson       code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
10193a2968d6SJeremy L Thompson       switch (eval_mode) {
10203a2968d6SJeremy L Thompson         case CEED_EVAL_NONE: {
10213a2968d6SJeremy L Thompson           CeedInt             comp_stride;
10223a2968d6SJeremy L Thompson           CeedElemRestriction elem_rstr;
10233a2968d6SJeremy L Thompson 
10243a2968d6SJeremy L Thompson           CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
10253a2968d6SJeremy L Thompson           CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
10263a2968d6SJeremy L Thompson           CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1027*0183ed61SJeremy L Thompson           code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
1028*0183ed61SJeremy L Thompson           code << tab << "WritePoint<num_comp" << var_suffix << ", comp_stride" << var_suffix
10293a2968d6SJeremy L Thompson                << ", max_num_points>(data, elem, i, points.num_per_elem[elem], indices.outputs[" << i << "]"
10303a2968d6SJeremy L Thompson                << ", r_s" << var_suffix << ", d" << var_suffix << ");\n";
10313a2968d6SJeremy L Thompson           break;
10323a2968d6SJeremy L Thompson         }
10333a2968d6SJeremy L Thompson         case CEED_EVAL_INTERP:
1034*0183ed61SJeremy L Thompson           code << tab << "if (i >= points.num_per_elem[elem]) {\n";
1035*0183ed61SJeremy L Thompson           tab.push();
1036*0183ed61SJeremy L Thompson           code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
1037*0183ed61SJeremy L Thompson           tab.pop();
1038*0183ed61SJeremy L Thompson           code << tab << "}\n";
1039*0183ed61SJeremy L Thompson           code << tab << "InterpTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
1040f725b54bSJeremy L Thompson                << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
10413a2968d6SJeremy L Thompson           break;
10423a2968d6SJeremy L Thompson         case CEED_EVAL_GRAD:
1043*0183ed61SJeremy L Thompson           code << tab << "if (i >= points.num_per_elem[elem]) {\n";
1044*0183ed61SJeremy L Thompson           tab.push();
1045*0183ed61SJeremy L Thompson           code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << "*dim" << var_suffix << "; j++) r_s" << var_suffix << "[j] = 0.0;\n";
1046*0183ed61SJeremy L Thompson           tab.pop();
1047*0183ed61SJeremy L Thompson           code << tab << "}\n";
1048*0183ed61SJeremy L Thompson           code << tab << "GradTransposeAtPoints" << max_dim << "d<num_comp" << var_suffix << ", max_num_points, " << P_name << ", " << Q_name
1049f725b54bSJeremy L Thompson                << ">(data, i, r_s" << var_suffix << ", r_x, r_c" << var_suffix << ");\n";
10503a2968d6SJeremy L Thompson           break;
10513a2968d6SJeremy L Thompson           // LCOV_EXCL_START
10523a2968d6SJeremy L Thompson         case CEED_EVAL_WEIGHT:
10533a2968d6SJeremy L Thompson           break;  // Should not occur
10543a2968d6SJeremy L Thompson         case CEED_EVAL_DIV:
10553a2968d6SJeremy L Thompson         case CEED_EVAL_CURL:
10563a2968d6SJeremy L Thompson           break;  // TODO: Not implemented
10573a2968d6SJeremy L Thompson                   // LCOV_EXCL_STOP
10583a2968d6SJeremy L Thompson       }
10593a2968d6SJeremy L Thompson     }
10603a2968d6SJeremy L Thompson   } else if (use_3d_slices) {
10614b3e95d5SJeremy L Thompson     // Copy or apply transpose grad, if needed
1062*0183ed61SJeremy L Thompson     code << "\n";
1063*0183ed61SJeremy L Thompson     code << tab << "// -- Output fields\n";
10644b3e95d5SJeremy L Thompson     for (CeedInt i = 0; i < num_output_fields; i++) {
106559fa3f92SJeremy L Thompson       const char *field_name;
10664b3e95d5SJeremy L Thompson       std::string var_suffix = "_out_" + std::to_string(i);
10674b3e95d5SJeremy L Thompson       std::string P_name     = "P_1d" + var_suffix;
10684b3e95d5SJeremy L Thompson 
106959fa3f92SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
1070*0183ed61SJeremy L Thompson       code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
10714b3e95d5SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
10724b3e95d5SJeremy L Thompson       // Basis action
1073*0183ed61SJeremy L Thompson       code << tab << "// EvalMode: " << CeedEvalModes[eval_mode] << "\n";
10744b3e95d5SJeremy L Thompson       switch (eval_mode) {
10754b3e95d5SJeremy L Thompson         case CEED_EVAL_NONE:
1076*0183ed61SJeremy L Thompson           code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
1077*0183ed61SJeremy L Thompson           tab.push();
1078*0183ed61SJeremy L Thompson           code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
1079*0183ed61SJeremy L Thompson           tab.pop();
1080*0183ed61SJeremy L Thompson           code << tab << "}\n";
10813a2968d6SJeremy L Thompson           break;
10824b3e95d5SJeremy L Thompson         case CEED_EVAL_INTERP:
1083*0183ed61SJeremy L Thompson           code << tab << "for (CeedInt j = 0; j < num_comp" << var_suffix << " ; j++) {\n";
1084*0183ed61SJeremy L Thompson           tab.push();
1085*0183ed61SJeremy L Thompson           code << tab << "r_q" << var_suffix << "[q + j*" << Q_name << "] = r_s" << var_suffix << "[j];\n";
1086*0183ed61SJeremy L Thompson           tab.pop();
1087*0183ed61SJeremy L Thompson           code << tab << "}\n";
10884b3e95d5SJeremy L Thompson           break;
10894b3e95d5SJeremy L Thompson         case CEED_EVAL_GRAD:
1090*0183ed61SJeremy L Thompson           code << tab << "GradColloSliceTranspose3d<num_comp" << var_suffix << ", " << Q_name << ", OP_T_1D>(data, q, r_s" << var_suffix << ", s_G"
1091f815fac9SJeremy L Thompson                << var_suffix << ", r_q" << var_suffix << ");\n";
10924b3e95d5SJeremy L Thompson           break;
10934b3e95d5SJeremy L Thompson           // LCOV_EXCL_START
10944b3e95d5SJeremy L Thompson         case CEED_EVAL_WEIGHT:
10954b3e95d5SJeremy L Thompson           break;  // Should not occur
10964b3e95d5SJeremy L Thompson         case CEED_EVAL_DIV:
10974b3e95d5SJeremy L Thompson         case CEED_EVAL_CURL:
10984b3e95d5SJeremy L Thompson           break;  // TODO: Not implemented
10994b3e95d5SJeremy L Thompson                   // LCOV_EXCL_STOP
11004b3e95d5SJeremy L Thompson       }
11014b3e95d5SJeremy L Thompson     }
11024b3e95d5SJeremy L Thompson   }
1103*0183ed61SJeremy L Thompson   tab.pop();
1104*0183ed61SJeremy L Thompson   code << tab << "}\n";
11054b3e95d5SJeremy L Thompson   return CEED_ERROR_SUCCESS;
11064b3e95d5SJeremy L Thompson }
11074b3e95d5SJeremy L Thompson 
11084b3e95d5SJeremy L Thompson //------------------------------------------------------------------------------
11099e201c85SYohann // Build single operator kernel
11107d8d0e25Snbeams //------------------------------------------------------------------------------
11118d12f40eSJeremy L Thompson extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_build) {
111274398b5aSJeremy L Thompson   bool                   is_all_tensor = true, is_all_nontensor = true, is_at_points = false, use_3d_slices = false;
11137d8d0e25Snbeams   Ceed                   ceed;
1114efa41df3SJeremy L Thompson   CeedInt                Q = 0, Q_1d = 0, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
1115b7453713SJeremy L Thompson   CeedQFunctionField    *qf_input_fields, *qf_output_fields;
1116b7453713SJeremy L Thompson   CeedQFunction_Hip_gen *qf_data;
1117b7453713SJeremy L Thompson   CeedQFunction          qf;
1118b7453713SJeremy L Thompson   CeedOperatorField     *op_input_fields, *op_output_fields;
1119b7453713SJeremy L Thompson   CeedOperator_Hip_gen  *data;
11204b3e95d5SJeremy L Thompson   std::ostringstream     code;
1121*0183ed61SJeremy L Thompson   Tab                    tab;
11224b3e95d5SJeremy L Thompson 
11238d12f40eSJeremy L Thompson   CeedCallBackend(CeedOperatorGetData(op, &data));
11244b3e95d5SJeremy L Thompson   {
11254b3e95d5SJeremy L Thompson     bool is_setup_done;
1126b7453713SJeremy L Thompson 
1127b7453713SJeremy L Thompson     CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done));
11288d12f40eSJeremy L Thompson     if (is_setup_done) {
11298d12f40eSJeremy L Thompson       *is_good_build = !data->use_fallback;
11308d12f40eSJeremy L Thompson       return CEED_ERROR_SUCCESS;
11318d12f40eSJeremy L Thompson     }
11324b3e95d5SJeremy L Thompson   }
1133b7453713SJeremy L Thompson 
11348d12f40eSJeremy L Thompson   // Check field compatibility
11358d12f40eSJeremy L Thompson   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
11368d12f40eSJeremy L Thompson   {
113774398b5aSJeremy L Thompson     bool has_shared_bases = true;
11388d12f40eSJeremy L Thompson 
11398d12f40eSJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) {
11408d12f40eSJeremy L Thompson       CeedBasis basis;
11418d12f40eSJeremy L Thompson 
11428d12f40eSJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
11438d12f40eSJeremy L Thompson       if (basis != CEED_BASIS_NONE) {
11448d12f40eSJeremy L Thompson         bool        is_tensor = true;
11458d12f40eSJeremy L Thompson         const char *resource;
11468d12f40eSJeremy L Thompson         char       *resource_root;
11478d12f40eSJeremy L Thompson         Ceed        basis_ceed;
11488d12f40eSJeremy L Thompson 
11498d12f40eSJeremy L Thompson         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
1150c9192acaSJeremy L Thompson         is_all_tensor    = is_all_tensor && is_tensor;
1151c9192acaSJeremy L Thompson         is_all_nontensor = is_all_nontensor && !is_tensor;
11528d12f40eSJeremy L Thompson         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
11538d12f40eSJeremy L Thompson         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
11548d12f40eSJeremy L Thompson         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
1155c9192acaSJeremy L Thompson         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
11568d12f40eSJeremy L Thompson         CeedCallBackend(CeedFree(&resource_root));
11578d12f40eSJeremy L Thompson         CeedCallBackend(CeedDestroy(&basis_ceed));
11588d12f40eSJeremy L Thompson       }
11598d12f40eSJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis));
11608d12f40eSJeremy L Thompson     }
11618d12f40eSJeremy L Thompson 
11628d12f40eSJeremy L Thompson     for (CeedInt i = 0; i < num_output_fields; i++) {
11638d12f40eSJeremy L Thompson       CeedBasis basis;
11648d12f40eSJeremy L Thompson 
11658d12f40eSJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
11668d12f40eSJeremy L Thompson       if (basis != CEED_BASIS_NONE) {
11678d12f40eSJeremy L Thompson         bool        is_tensor = true;
11688d12f40eSJeremy L Thompson         const char *resource;
11698d12f40eSJeremy L Thompson         char       *resource_root;
11708d12f40eSJeremy L Thompson         Ceed        basis_ceed;
11718d12f40eSJeremy L Thompson 
11728d12f40eSJeremy L Thompson         CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor));
1173c9192acaSJeremy L Thompson         is_all_tensor    = is_all_tensor && is_tensor;
1174c9192acaSJeremy L Thompson         is_all_nontensor = is_all_nontensor && !is_tensor;
11758d12f40eSJeremy L Thompson 
11768d12f40eSJeremy L Thompson         CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed));
11778d12f40eSJeremy L Thompson         CeedCallBackend(CeedGetResource(basis_ceed, &resource));
11788d12f40eSJeremy L Thompson         CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root));
1179c9192acaSJeremy L Thompson         has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared");
11808d12f40eSJeremy L Thompson         CeedCallBackend(CeedFree(&resource_root));
11818d12f40eSJeremy L Thompson         CeedCallBackend(CeedDestroy(&basis_ceed));
11828d12f40eSJeremy L Thompson       }
11838d12f40eSJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis));
11848d12f40eSJeremy L Thompson     }
11858d12f40eSJeremy L Thompson     // -- Fallback to ref if not all bases are shared
118674398b5aSJeremy L Thompson     if (!has_shared_bases) {
11878d12f40eSJeremy L Thompson       *is_good_build = false;
11888d12f40eSJeremy L Thompson       return CEED_ERROR_SUCCESS;
11898d12f40eSJeremy L Thompson     }
11908d12f40eSJeremy L Thompson   }
1191b7453713SJeremy L Thompson   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1192b7453713SJeremy L Thompson   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
1193b7453713SJeremy L Thompson   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
1194b7453713SJeremy L Thompson   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
11957d8d0e25Snbeams 
11964b3e95d5SJeremy L Thompson   // Get operator data
11973a2968d6SJeremy L Thompson   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
119874398b5aSJeremy L Thompson   {
1199efa41df3SJeremy L Thompson     CeedInt max_P = 0, max_P_1d = 0;
120074398b5aSJeremy L Thompson 
12014b3e95d5SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelData_Hip_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields,
120274398b5aSJeremy L Thompson                                                         qf_output_fields, &max_P, &max_P_1d, &Q, &Q_1d, &max_dim, &is_all_tensor, &use_3d_slices));
120374398b5aSJeremy L Thompson     data->max_P_1d = is_all_tensor ? max_P_1d : max_P;
120474398b5aSJeremy L Thompson   }
120574398b5aSJeremy L Thompson   if (max_dim == 0) max_dim = 1;
120674398b5aSJeremy L Thompson   data->dim = max_dim;
12073a2968d6SJeremy L Thompson   if (is_at_points) {
12083a2968d6SJeremy L Thompson     CeedElemRestriction_Hip *rstr_data;
12093a2968d6SJeremy L Thompson     CeedElemRestriction      rstr_points = NULL;
12104b3e95d5SJeremy L Thompson 
12113a2968d6SJeremy L Thompson     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
12123a2968d6SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
12133a2968d6SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
12143a2968d6SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetData(rstr_points, &rstr_data));
12153a2968d6SJeremy L Thompson     data->points.indices = (CeedInt *)rstr_data->d_offsets;
12163a2968d6SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
12173a2968d6SJeremy L Thompson   }
12183a2968d6SJeremy L Thompson   if (is_at_points) use_3d_slices = false;
12193a2968d6SJeremy L Thompson   if (Q_1d == 0) {
12203a2968d6SJeremy L Thompson     if (is_at_points) Q_1d = max_num_points;
12213a2968d6SJeremy L Thompson     else CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d));
12224b3e95d5SJeremy L Thompson   }
122374398b5aSJeremy L Thompson   if (Q == 0) Q = Q_1d;
122474398b5aSJeremy L Thompson   data->Q    = Q;
12254b3e95d5SJeremy L Thompson   data->Q_1d = Q_1d;
12264b3e95d5SJeremy L Thompson 
12270b454692Sjeremylt   // Check for restriction only identity operator
12284b3e95d5SJeremy L Thompson   {
12294b3e95d5SJeremy L Thompson     bool is_identity_qf;
12304b3e95d5SJeremy L Thompson 
12312b730f8bSJeremy L Thompson     CeedCallBackend(CeedQFunctionIsIdentity(qf, &is_identity_qf));
12320b454692Sjeremylt     if (is_identity_qf) {
12339e201c85SYohann       CeedEvalMode eval_mode_in, eval_mode_out;
1234b7453713SJeremy L Thompson 
12352b730f8bSJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[0], &eval_mode_in));
12362b730f8bSJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[0], &eval_mode_out));
12376574a04fSJeremy L Thompson       CeedCheck(eval_mode_in != CEED_EVAL_NONE || eval_mode_out != CEED_EVAL_NONE, ceed, CEED_ERROR_BACKEND,
12386574a04fSJeremy L Thompson                 "Backend does not implement restriction only identity operators");
12390b454692Sjeremylt     }
12404b3e95d5SJeremy L Thompson   }
1241b2165e7aSSebastian Grimberg 
1242b2165e7aSSebastian Grimberg   // Load basis source files
1243eaf9ad10SZach Atkins   if (!is_all_nontensor) {
1244*0183ed61SJeremy L Thompson     code << tab << "// Tensor basis source\n";
1245*0183ed61SJeremy L Thompson     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
124674398b5aSJeremy L Thompson   }
124774398b5aSJeremy L Thompson   if (!is_all_tensor) {
1248*0183ed61SJeremy L Thompson     code << tab << "// Non-tensor basis source\n";
1249*0183ed61SJeremy L Thompson     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-nontensor-templates.h>\n\n";
12509123fb08SJeremy L Thompson   }
12519123fb08SJeremy L Thompson   if (is_at_points) {
1252*0183ed61SJeremy L Thompson     code << tab << "// AtPoints basis source\n";
1253*0183ed61SJeremy L Thompson     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n";
12549123fb08SJeremy L Thompson   }
125574398b5aSJeremy L Thompson   if (!is_all_tensor && !is_all_nontensor) {
1256*0183ed61SJeremy L Thompson     code << tab << "// Tensor basis source\n";
1257*0183ed61SJeremy L Thompson     code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-flattened-templates.h>\n\n";
125874398b5aSJeremy L Thompson   }
1259*0183ed61SJeremy L Thompson   code << tab << "// CodeGen operator source\n";
1260*0183ed61SJeremy L Thompson   code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
12617d8d0e25Snbeams 
12624b3e95d5SJeremy L Thompson   // Get QFunction name
12634b3e95d5SJeremy L Thompson   std::string qfunction_name(qf_data->qfunction_name);
12644b3e95d5SJeremy L Thompson   std::string operator_name;
12654b3e95d5SJeremy L Thompson 
126609095acaSJeremy L Thompson   operator_name = "CeedKernelHipGenOperator_" + qfunction_name;
12677d8d0e25Snbeams 
12689e201c85SYohann   // Define CEED_Q_VLA
1269*0183ed61SJeremy L Thompson   code << "\n" << tab << "#undef CEED_Q_VLA\n";
127074398b5aSJeremy L Thompson   if (max_dim != 3 || is_at_points || use_3d_slices || !is_all_tensor) {
1271*0183ed61SJeremy L Thompson     code << tab << "#define CEED_Q_VLA 1\n\n";
12729e201c85SYohann   } else {
1273*0183ed61SJeremy L Thompson     code << tab << "#define CEED_Q_VLA " << Q_1d << "\n\n";
12749e201c85SYohann   }
12759e201c85SYohann 
12764b3e95d5SJeremy L Thompson   // Add user QFunction source
12774b3e95d5SJeremy L Thompson   {
12789c25dd66SJeremy L Thompson     const char *source_path;
12794b3e95d5SJeremy L Thompson 
12809c25dd66SJeremy L Thompson     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
12819c25dd66SJeremy L Thompson     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
12829c25dd66SJeremy L Thompson 
1283*0183ed61SJeremy L Thompson     code << tab << "// User QFunction source\n";
1284*0183ed61SJeremy L Thompson     code << tab << "#include \"" << source_path << "\"\n\n";
12854b3e95d5SJeremy L Thompson   }
12867d8d0e25Snbeams 
12877d8d0e25Snbeams   // Setup
1288*0183ed61SJeremy L Thompson   code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
1289*0183ed61SJeremy L Thompson   code << tab << "// Operator Kernel\n";
1290*0183ed61SJeremy L Thompson   code << tab << "// \n";
1291*0183ed61SJeremy L Thompson   code << tab << "// d_[in,out]_i:   CeedVector device array\n";
1292*0183ed61SJeremy L Thompson   code << tab << "// r_[in,out]_e_i: Element vector register\n";
1293*0183ed61SJeremy L Thompson   code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
1294*0183ed61SJeremy L Thompson   code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
1295*0183ed61SJeremy L Thompson   code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
1296*0183ed61SJeremy L Thompson   code << tab << "// \n";
1297*0183ed61SJeremy L Thompson   code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
1298*0183ed61SJeremy L Thompson   code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
1299*0183ed61SJeremy L Thompson   code << tab << "// -----------------------------------------------------------------------------\n";
1300*0183ed61SJeremy L Thompson   code << tab << "extern \"C\" __launch_bounds__(BLOCK_SIZE)\n";
13012b730f8bSJeremy L Thompson   code << "__global__ void " << operator_name
13023a2968d6SJeremy L Thompson        << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar* W, Points_Hip points) {\n";
1303*0183ed61SJeremy L Thompson   tab.push();
13044b3e95d5SJeremy L Thompson 
13054b3e95d5SJeremy L Thompson   // Scratch buffers
13069e201c85SYohann   for (CeedInt i = 0; i < num_input_fields; i++) {
13074b3e95d5SJeremy L Thompson     CeedEvalMode eval_mode;
13084b3e95d5SJeremy L Thompson 
13092b730f8bSJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
13109e201c85SYohann     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
1311*0183ed61SJeremy L Thompson       code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
13127d8d0e25Snbeams     }
13137d8d0e25Snbeams   }
13149e201c85SYohann   for (CeedInt i = 0; i < num_output_fields; i++) {
1315*0183ed61SJeremy L Thompson     code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
13167d8d0e25Snbeams   }
13177d8d0e25Snbeams 
1318*0183ed61SJeremy L Thompson   code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
131974398b5aSJeremy L Thompson   if (!is_all_tensor) {
1320*0183ed61SJeremy L Thompson     code << tab << "const CeedInt Q = " << Q << ";\n";
132174398b5aSJeremy L Thompson   }
132274398b5aSJeremy L Thompson   if (!is_all_nontensor) {
1323*0183ed61SJeremy L Thompson     code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
132474398b5aSJeremy L Thompson   }
13253a2968d6SJeremy L Thompson   if (is_at_points) {
1326*0183ed61SJeremy L Thompson     code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n";
1327*0183ed61SJeremy L Thompson     code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
13283a2968d6SJeremy L Thompson   }
13297d8d0e25Snbeams 
13304b3e95d5SJeremy L Thompson   // Shared data
1331*0183ed61SJeremy L Thompson   code << tab << "extern __shared__ CeedScalar slice[];\n";
1332*0183ed61SJeremy L Thompson   code << tab << "SharedData_Hip data;\n";
1333*0183ed61SJeremy L Thompson   code << tab << "data.t_id_x = threadIdx.x;\n";
1334*0183ed61SJeremy L Thompson   code << tab << "data.t_id_y = threadIdx.y;\n";
1335*0183ed61SJeremy L Thompson   code << tab << "data.t_id_z = threadIdx.z;\n";
1336*0183ed61SJeremy L Thompson   code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
1337*0183ed61SJeremy L Thompson   code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
13387d8d0e25Snbeams 
13399ee499e5SJeremy L Thompson   // -- Determine input mat reuse
134045a787f7SJeremy L Thompson   FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
13419ee499e5SJeremy L Thompson 
13429ee499e5SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
134345a787f7SJeremy L Thompson     input_matrix_reuse[i].index = -1;
13449ee499e5SJeremy L Thompson   }
13459ee499e5SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
134674398b5aSJeremy L Thompson     bool         is_tensor = true;
13479ee499e5SJeremy L Thompson     CeedEvalMode eval_mode_i;
13489ee499e5SJeremy L Thompson     CeedBasis    basis_i;
13499ee499e5SJeremy L Thompson 
13509ee499e5SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
13519ee499e5SJeremy L Thompson     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
13529ee499e5SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
135374398b5aSJeremy L Thompson     CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
135445a787f7SJeremy L Thompson     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
13559ee499e5SJeremy L Thompson       CeedEvalMode eval_mode_j;
13569ee499e5SJeremy L Thompson       CeedBasis    basis_j;
13579ee499e5SJeremy L Thompson 
13589ee499e5SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
13599ee499e5SJeremy L Thompson       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
13609ee499e5SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
13619ee499e5SJeremy L Thompson       if (basis_i == basis_j) {
13629ee499e5SJeremy L Thompson         if (is_tensor) {
136345a787f7SJeremy L Thompson           input_matrix_reuse[i].index     = j;
136445a787f7SJeremy L Thompson           input_matrix_reuse[i].is_input  = true;
136545a787f7SJeremy L Thompson           input_matrix_reuse[i].eval_mode = eval_mode_j;
13669ee499e5SJeremy L Thompson         } else {
13679ee499e5SJeremy L Thompson           // For non-tensor can only re-use with the same eval mode
13689ee499e5SJeremy L Thompson           if (eval_mode_i == eval_mode_j) {
136945a787f7SJeremy L Thompson             input_matrix_reuse[i].index     = j;
137045a787f7SJeremy L Thompson             input_matrix_reuse[i].is_input  = true;
137145a787f7SJeremy L Thompson             input_matrix_reuse[i].eval_mode = eval_mode_j;
13729ee499e5SJeremy L Thompson           }
13739ee499e5SJeremy L Thompson         }
13749ee499e5SJeremy L Thompson       }
13759ee499e5SJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis_j));
13769ee499e5SJeremy L Thompson     }
13779ee499e5SJeremy L Thompson     CeedCallBackend(CeedBasisDestroy(&basis_i));
13789ee499e5SJeremy L Thompson   }
13799ee499e5SJeremy L Thompson 
13809ee499e5SJeremy L Thompson   // -- Determine output mat reuse
138145a787f7SJeremy L Thompson   FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
13829ee499e5SJeremy L Thompson 
13839ee499e5SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
138445a787f7SJeremy L Thompson     output_matrix_reuse[i].index = -1;
13859ee499e5SJeremy L Thompson   }
13869ee499e5SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
138774398b5aSJeremy L Thompson     bool         is_tensor = true;
13889ee499e5SJeremy L Thompson     CeedEvalMode eval_mode_i;
13899ee499e5SJeremy L Thompson     CeedBasis    basis_i;
13909ee499e5SJeremy L Thompson 
13919ee499e5SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
13929ee499e5SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
139345a787f7SJeremy L Thompson     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
13949ee499e5SJeremy L Thompson       CeedEvalMode eval_mode_j;
13959ee499e5SJeremy L Thompson       CeedBasis    basis_j;
13969ee499e5SJeremy L Thompson 
13979ee499e5SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
13989ee499e5SJeremy L Thompson       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
13999ee499e5SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
14009ee499e5SJeremy L Thompson       if (basis_i == basis_j) {
14019ee499e5SJeremy L Thompson         if (is_tensor) {
140245a787f7SJeremy L Thompson           output_matrix_reuse[i].index     = j;
140345a787f7SJeremy L Thompson           output_matrix_reuse[i].is_input  = true;
140445a787f7SJeremy L Thompson           output_matrix_reuse[i].eval_mode = eval_mode_j;
14059ee499e5SJeremy L Thompson         } else {
14069ee499e5SJeremy L Thompson           // For non-tensor can only re-use with the same eval mode
14079ee499e5SJeremy L Thompson           if (eval_mode_i == eval_mode_j) {
140845a787f7SJeremy L Thompson             output_matrix_reuse[i].index     = j;
140945a787f7SJeremy L Thompson             output_matrix_reuse[i].is_input  = true;
141045a787f7SJeremy L Thompson             output_matrix_reuse[i].eval_mode = eval_mode_j;
14119ee499e5SJeremy L Thompson           }
14129ee499e5SJeremy L Thompson         }
14139ee499e5SJeremy L Thompson       }
14149ee499e5SJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis_j));
14159ee499e5SJeremy L Thompson     }
141645a787f7SJeremy L Thompson     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
14179ee499e5SJeremy L Thompson       CeedEvalMode eval_mode_j;
14189ee499e5SJeremy L Thompson       CeedBasis    basis_j;
14199ee499e5SJeremy L Thompson 
14209ee499e5SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
14219ee499e5SJeremy L Thompson       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
14229ee499e5SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
142374398b5aSJeremy L Thompson       CeedCallBackend(CeedBasisIsTensor(basis_i, &is_tensor));
14249ee499e5SJeremy L Thompson       if (basis_i == basis_j) {
14259ee499e5SJeremy L Thompson         if (is_tensor) {
142645a787f7SJeremy L Thompson           output_matrix_reuse[i].index     = j;
142745a787f7SJeremy L Thompson           output_matrix_reuse[i].is_input  = false;
142845a787f7SJeremy L Thompson           output_matrix_reuse[i].eval_mode = eval_mode_j;
14299ee499e5SJeremy L Thompson         } else {
14309ee499e5SJeremy L Thompson           // For non-tensor can only re-use with the same eval mode
14319ee499e5SJeremy L Thompson           if (eval_mode_i == eval_mode_j) {
143245a787f7SJeremy L Thompson             output_matrix_reuse[i].index     = j;
143345a787f7SJeremy L Thompson             output_matrix_reuse[i].is_input  = false;
143445a787f7SJeremy L Thompson             output_matrix_reuse[i].eval_mode = eval_mode_j;
14359ee499e5SJeremy L Thompson           }
14369ee499e5SJeremy L Thompson         }
14379ee499e5SJeremy L Thompson       }
14389ee499e5SJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis_j));
14399ee499e5SJeremy L Thompson     }
14409ee499e5SJeremy L Thompson     CeedCallBackend(CeedBasisDestroy(&basis_i));
14419ee499e5SJeremy L Thompson   }
14429ee499e5SJeremy L Thompson 
14437d8d0e25Snbeams   // Initialize constants, and matrices B and G
1444*0183ed61SJeremy L Thompson   code << "\n" << tab << "// Input field constants and basis data\n";
14459e201c85SYohann   for (CeedInt i = 0; i < num_input_fields; i++) {
1446*0183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
1447*0183ed61SJeremy L Thompson                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
14487d8d0e25Snbeams   }
1449*0183ed61SJeremy L Thompson   code << "\n" << tab << "// Output field constants and basis data\n";
14509e201c85SYohann   for (CeedInt i = 0; i < num_output_fields; i++) {
1451*0183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
1452*0183ed61SJeremy L Thompson                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
14534b3e95d5SJeremy L Thompson   }
14547d8d0e25Snbeams 
14554b3e95d5SJeremy L Thompson   // Loop over all elements
1456*0183ed61SJeremy L Thompson   code << "\n" << tab << "// Element loop\n";
1457*0183ed61SJeremy L Thompson   code << tab << "__syncthreads();\n";
1458*0183ed61SJeremy L Thompson   code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
1459*0183ed61SJeremy L Thompson   tab.push();
14604b3e95d5SJeremy L Thompson 
1461e93651e5SJeremy L Thompson   // -- Compute minimum buffer space needed
14623a2968d6SJeremy L Thompson   CeedInt max_rstr_buffer_size = 1;
1463e93651e5SJeremy L Thompson 
1464e93651e5SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
14656de40545SJeremy L Thompson     CeedEvalMode eval_mode;
14666de40545SJeremy L Thompson 
14676de40545SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
14686de40545SJeremy L Thompson     if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
1469a61b1c91SJeremy L Thompson       CeedInt             num_comp;
1470e93651e5SJeremy L Thompson       CeedElemRestriction elem_rstr;
1471e93651e5SJeremy L Thompson 
1472e93651e5SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
1473e93651e5SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1474a61b1c91SJeremy L Thompson       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1475681d0ea7SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1476e93651e5SJeremy L Thompson     }
14776de40545SJeremy L Thompson   }
1478e93651e5SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
14796de40545SJeremy L Thompson     CeedEvalMode eval_mode;
14806de40545SJeremy L Thompson 
14816de40545SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
14826de40545SJeremy L Thompson     if (eval_mode != CEED_EVAL_NONE) {
1483a61b1c91SJeremy L Thompson       CeedInt             num_comp;
1484e93651e5SJeremy L Thompson       CeedElemRestriction elem_rstr;
1485e93651e5SJeremy L Thompson 
1486e93651e5SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
1487e93651e5SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1488a61b1c91SJeremy L Thompson       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1489681d0ea7SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1490e93651e5SJeremy L Thompson     }
14916de40545SJeremy L Thompson   }
1492*0183ed61SJeremy L Thompson   code << tab << "// Scratch restriction buffer space\n";
1493*0183ed61SJeremy L Thompson   code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
1494e93651e5SJeremy L Thompson 
1495e93651e5SJeremy L Thompson   // -- Determine best input field processing order
1496e93651e5SJeremy L Thompson   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
1497e93651e5SJeremy L Thompson 
1498e93651e5SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
1499e93651e5SJeremy L Thompson     field_rstr_in_buffer[i] = -1;
1500e93651e5SJeremy L Thompson     input_field_order[i]    = -1;
1501e93651e5SJeremy L Thompson   }
1502e93651e5SJeremy L Thompson   {
1503e93651e5SJeremy L Thompson     bool    is_ordered[CEED_FIELD_MAX];
1504e93651e5SJeremy L Thompson     CeedInt curr_index = 0;
1505e93651e5SJeremy L Thompson 
1506e93651e5SJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
1507e93651e5SJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) {
1508e93651e5SJeremy L Thompson       CeedVector          vec_i;
1509e93651e5SJeremy L Thompson       CeedElemRestriction rstr_i;
1510e93651e5SJeremy L Thompson 
1511e93651e5SJeremy L Thompson       if (is_ordered[i]) continue;
1512e93651e5SJeremy L Thompson       field_rstr_in_buffer[i]       = i;
1513e93651e5SJeremy L Thompson       is_ordered[i]                 = true;
1514e93651e5SJeremy L Thompson       input_field_order[curr_index] = i;
1515e93651e5SJeremy L Thompson       curr_index++;
1516034f99fdSJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
1517e93651e5SJeremy L Thompson       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
1518e93651e5SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
1519e93651e5SJeremy L Thompson       for (CeedInt j = i + 1; j < num_input_fields; j++) {
1520e93651e5SJeremy L Thompson         CeedVector          vec_j;
1521e93651e5SJeremy L Thompson         CeedElemRestriction rstr_j;
1522e93651e5SJeremy L Thompson 
1523e93651e5SJeremy L Thompson         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
1524e93651e5SJeremy L Thompson         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
1525e93651e5SJeremy L Thompson         if (rstr_i == rstr_j && vec_i == vec_j) {
1526e93651e5SJeremy L Thompson           field_rstr_in_buffer[j]       = i;
1527e93651e5SJeremy L Thompson           is_ordered[j]                 = true;
1528e93651e5SJeremy L Thompson           input_field_order[curr_index] = j;
1529e93651e5SJeremy L Thompson           curr_index++;
1530e93651e5SJeremy L Thompson         }
15313a2968d6SJeremy L Thompson         CeedCallBackend(CeedVectorDestroy(&vec_j));
15323a2968d6SJeremy L Thompson         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
1533e93651e5SJeremy L Thompson       }
15343a2968d6SJeremy L Thompson       CeedCallBackend(CeedVectorDestroy(&vec_i));
15353a2968d6SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
1536e93651e5SJeremy L Thompson     }
1537e93651e5SJeremy L Thompson   }
1538e93651e5SJeremy L Thompson 
15394b3e95d5SJeremy L Thompson   // -- Input restriction and basis
1540*0183ed61SJeremy L Thompson   code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
15419e201c85SYohann   for (CeedInt i = 0; i < num_input_fields; i++) {
154259fa3f92SJeremy L Thompson     const char   *field_name;
154359fa3f92SJeremy L Thompson     const CeedInt f = input_field_order[i];
1544e93651e5SJeremy L Thompson 
154559fa3f92SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
1546*0183ed61SJeremy L Thompson     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
15477d8d0e25Snbeams 
15484b3e95d5SJeremy L Thompson     // ---- Restriction
1549*0183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
1550*0183ed61SJeremy L Thompson                                                                max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1551b7453713SJeremy L Thompson 
15524b3e95d5SJeremy L Thompson     // ---- Basis action
1553*0183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
1554*0183ed61SJeremy L Thompson                                                          is_all_tensor, is_at_points, use_3d_slices));
15557d8d0e25Snbeams   }
15567d8d0e25Snbeams 
15574b3e95d5SJeremy L Thompson   // -- Q function
1558*0183ed61SJeremy L Thompson   CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
1559*0183ed61SJeremy L Thompson                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
1560*0183ed61SJeremy L Thompson                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices));
15617d8d0e25Snbeams 
15624b3e95d5SJeremy L Thompson   // -- Output basis and restriction
1563*0183ed61SJeremy L Thompson   code << "\n" << tab << "// -- Output field basis action and restrictions\n";
15649e201c85SYohann   for (CeedInt i = 0; i < num_output_fields; i++) {
156559fa3f92SJeremy L Thompson     const char *field_name;
156659fa3f92SJeremy L Thompson 
156759fa3f92SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
1568*0183ed61SJeremy L Thompson     code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
1569b7453713SJeremy L Thompson 
15704b3e95d5SJeremy L Thompson     // ---- Basis action
1571*0183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
1572*0183ed61SJeremy L Thompson                                                          is_all_tensor, is_at_points, use_3d_slices));
15737d8d0e25Snbeams 
15744b3e95d5SJeremy L Thompson     // ---- Restriction
1575*0183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, i, NULL, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d,
1576*0183ed61SJeremy L Thompson                                                                false, is_all_tensor, is_at_points, use_3d_slices));
15777d8d0e25Snbeams   }
15787d8d0e25Snbeams 
15794b3e95d5SJeremy L Thompson   // Close loop and function
1580*0183ed61SJeremy L Thompson   tab.pop();
1581*0183ed61SJeremy L Thompson   code << tab << "}\n";
1582*0183ed61SJeremy L Thompson   tab.pop();
1583*0183ed61SJeremy L Thompson   code << tab << "}\n";
1584*0183ed61SJeremy L Thompson   code << tab << "// -----------------------------------------------------------------------------\n\n";
15857d8d0e25Snbeams 
1586539ec17dSJeremy L Thompson   CeedInt block_sizes[3] = {0, 0, 0};
15879e201c85SYohann   CeedInt num_elem;
1588b7453713SJeremy L Thompson 
15893a2968d6SJeremy L Thompson   // Compile
15902b730f8bSJeremy L Thompson   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
159174398b5aSJeremy L Thompson   CeedCallBackend(BlockGridCalculate_Hip_gen(is_all_tensor ? max_dim : 1, num_elem, data->max_P_1d, is_all_tensor ? Q_1d : Q, block_sizes));
159290c30374SJeremy L Thompson   if (is_at_points) block_sizes[2] = 1;
15938d12f40eSJeremy L Thompson   {
15948d12f40eSJeremy L Thompson     bool is_compile_good = false;
15958d12f40eSJeremy L Thompson 
1596a61b1c91SJeremy L Thompson     data->thread_1d = block_sizes[0];
15976b92dc4bSJeremy L Thompson     CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "OP_T_1D", block_sizes[0], "BLOCK_SIZE",
15982b730f8bSJeremy L Thompson                                        block_sizes[0] * block_sizes[1] * block_sizes[2]));
15998d12f40eSJeremy L Thompson     if (is_compile_good) {
16008d12f40eSJeremy L Thompson       *is_good_build = true;
1601eb7e6cafSJeremy L Thompson       CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, operator_name.c_str(), &data->op));
16028d12f40eSJeremy L Thompson     } else {
16038d12f40eSJeremy L Thompson       *is_good_build     = false;
16048d12f40eSJeremy L Thompson       data->use_fallback = true;
16058d12f40eSJeremy L Thompson     }
16068d12f40eSJeremy L Thompson   }
16072b730f8bSJeremy L Thompson   CeedCallBackend(CeedOperatorSetSetupDone(op));
16089bc66399SJeremy L Thompson   CeedCallBackend(CeedDestroy(&ceed));
1609c11e12f4SJeremy L Thompson   CeedCallBackend(CeedQFunctionDestroy(&qf));
1610e15f9bd0SJeremy L Thompson   return CEED_ERROR_SUCCESS;
16117d8d0e25Snbeams }
16122a86cc9dSSebastian Grimberg 
16137d8d0e25Snbeams //------------------------------------------------------------------------------
1614*0183ed61SJeremy L Thompson // Build AtPoints assembly operator kernel
1615*0183ed61SJeremy L Thompson //------------------------------------------------------------------------------
1616*0183ed61SJeremy L Thompson static int CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(CeedOperator op, bool is_full, bool *is_good_build) {
1617*0183ed61SJeremy L Thompson   bool                   is_all_tensor = true, is_at_points = false, use_3d_slices = false;
1618*0183ed61SJeremy L Thompson   Ceed                   ceed;
1619*0183ed61SJeremy L Thompson   CeedInt                Q, Q_1d, num_input_fields, num_output_fields, max_dim = 1, max_num_points = 0, coords_comp_stride = 0;
1620*0183ed61SJeremy L Thompson   CeedQFunctionField    *qf_input_fields, *qf_output_fields;
1621*0183ed61SJeremy L Thompson   CeedQFunction_Hip_gen *qf_data;
1622*0183ed61SJeremy L Thompson   CeedQFunction          qf;
1623*0183ed61SJeremy L Thompson   CeedOperatorField     *op_input_fields, *op_output_fields;
1624*0183ed61SJeremy L Thompson   CeedOperator_Hip_gen  *data;
1625*0183ed61SJeremy L Thompson   std::ostringstream     code;
1626*0183ed61SJeremy L Thompson   Tab                    tab;
1627*0183ed61SJeremy L Thompson 
1628*0183ed61SJeremy L Thompson   // Check compatibility
1629*0183ed61SJeremy L Thompson   CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
1630*0183ed61SJeremy L Thompson   CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
1631*0183ed61SJeremy L Thompson   CeedCheck(is_at_points, ceed, CEED_ERROR_BACKEND, "Only AtPoints operator assembly supported");
1632*0183ed61SJeremy L Thompson 
1633*0183ed61SJeremy L Thompson   // Retrieve operator data
1634*0183ed61SJeremy L Thompson   CeedCallBackend(CeedOperatorGetData(op, &data));
1635*0183ed61SJeremy L Thompson   Q       = data->Q;
1636*0183ed61SJeremy L Thompson   Q_1d    = data->Q_1d;
1637*0183ed61SJeremy L Thompson   max_dim = data->dim;
1638*0183ed61SJeremy L Thompson   {
1639*0183ed61SJeremy L Thompson     CeedElemRestriction rstr_points = NULL;
1640*0183ed61SJeremy L Thompson 
1641*0183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
1642*0183ed61SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
1643*0183ed61SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionGetCompStride(rstr_points, &coords_comp_stride));
1644*0183ed61SJeremy L Thompson     CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
1645*0183ed61SJeremy L Thompson   }
1646*0183ed61SJeremy L Thompson   CeedCallBackend(CeedOperatorGetQFunction(op, &qf));
1647*0183ed61SJeremy L Thompson   CeedCallBackend(CeedQFunctionGetData(qf, &qf_data));
1648*0183ed61SJeremy L Thompson   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
1649*0183ed61SJeremy L Thompson   CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields));
1650*0183ed61SJeremy L Thompson 
1651*0183ed61SJeremy L Thompson   // Load basis source files
1652*0183ed61SJeremy L Thompson   code << tab << "// Tensor basis source\n";
1653*0183ed61SJeremy L Thompson   code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-templates.h>\n\n";
1654*0183ed61SJeremy L Thompson   code << tab << "// AtPoints basis source\n";
1655*0183ed61SJeremy L Thompson   code << tab << "#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h>\n\n";
1656*0183ed61SJeremy L Thompson   code << tab << "// CodeGen operator source\n";
1657*0183ed61SJeremy L Thompson   code << tab << "#include <ceed/jit-source/hip/hip-gen-templates.h>\n\n";
1658*0183ed61SJeremy L Thompson 
1659*0183ed61SJeremy L Thompson   // Get QFunction name
1660*0183ed61SJeremy L Thompson   std::string qfunction_name(qf_data->qfunction_name);
1661*0183ed61SJeremy L Thompson   std::string operator_name;
1662*0183ed61SJeremy L Thompson 
1663*0183ed61SJeremy L Thompson   if (is_full) {
1664*0183ed61SJeremy L Thompson     operator_name = "CeedKernelHipGenOperatorFullAssembly_" + qfunction_name;
1665*0183ed61SJeremy L Thompson   } else {
1666*0183ed61SJeremy L Thompson     operator_name = "CeedKernelHipGenOperatorDiagonalAssembly_" + qfunction_name;
1667*0183ed61SJeremy L Thompson   }
1668*0183ed61SJeremy L Thompson 
1669*0183ed61SJeremy L Thompson   // Define CEED_Q_VLA
1670*0183ed61SJeremy L Thompson   code << "\n" << tab << "#undef CEED_Q_VLA\n";
1671*0183ed61SJeremy L Thompson   code << tab << "#define CEED_Q_VLA 1\n\n";
1672*0183ed61SJeremy L Thompson 
1673*0183ed61SJeremy L Thompson   // Add user QFunction source
1674*0183ed61SJeremy L Thompson   {
1675*0183ed61SJeremy L Thompson     const char *source_path;
1676*0183ed61SJeremy L Thompson 
1677*0183ed61SJeremy L Thompson     CeedCallBackend(CeedQFunctionGetSourcePath(qf, &source_path));
1678*0183ed61SJeremy L Thompson     CeedCheck(source_path, ceed, CEED_ERROR_UNSUPPORTED, "/gpu/hip/gen backend requires QFunction source code file");
1679*0183ed61SJeremy L Thompson 
1680*0183ed61SJeremy L Thompson     code << tab << "// User QFunction source\n";
1681*0183ed61SJeremy L Thompson     code << tab << "#include \"" << source_path << "\"\n\n";
1682*0183ed61SJeremy L Thompson   }
1683*0183ed61SJeremy L Thompson 
1684*0183ed61SJeremy L Thompson   // Setup
1685*0183ed61SJeremy L Thompson   code << "\n" << tab << "// -----------------------------------------------------------------------------\n";
1686*0183ed61SJeremy L Thompson   code << tab << "// Operator Assembly Kernel\n";
1687*0183ed61SJeremy L Thompson   code << tab << "// \n";
1688*0183ed61SJeremy L Thompson   code << tab << "// d_[in,out]_i:   CeedVector device array\n";
1689*0183ed61SJeremy L Thompson   code << tab << "// r_[in,out]_e_i: Element vector register\n";
1690*0183ed61SJeremy L Thompson   code << tab << "// r_[in,out]_q_i: Quadrature space vector register\n";
1691*0183ed61SJeremy L Thompson   code << tab << "// r_[in,out]_c_i: AtPoints Chebyshev coefficients register\n";
1692*0183ed61SJeremy L Thompson   code << tab << "// r_[in,out]_s_i: Quadrature space slice vector register\n";
1693*0183ed61SJeremy L Thompson   code << tab << "// \n";
1694*0183ed61SJeremy L Thompson   code << tab << "// s_B_[in,out]_i: Interpolation matrix, shared memory\n";
1695*0183ed61SJeremy L Thompson   code << tab << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
1696*0183ed61SJeremy L Thompson   code << tab << "// -----------------------------------------------------------------------------\n";
1697*0183ed61SJeremy L Thompson   code << tab << "extern \"C\" __global__ void " << operator_name
1698*0183ed61SJeremy L Thompson        << "(CeedInt num_elem, void* ctx, FieldsInt_Hip indices, Fields_Hip fields, Fields_Hip B, Fields_Hip G, CeedScalar *W, Points_Hip "
1699*0183ed61SJeremy L Thompson           "points, CeedScalar *__restrict__ values_array) {\n";
1700*0183ed61SJeremy L Thompson   tab.push();
1701*0183ed61SJeremy L Thompson 
1702*0183ed61SJeremy L Thompson   // Scratch buffers
1703*0183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
1704*0183ed61SJeremy L Thompson     CeedEvalMode eval_mode;
1705*0183ed61SJeremy L Thompson 
1706*0183ed61SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1707*0183ed61SJeremy L Thompson     if (eval_mode != CEED_EVAL_WEIGHT) {  // Skip CEED_EVAL_WEIGHT
1708*0183ed61SJeremy L Thompson       code << tab << "const CeedScalar *__restrict__ d_in_" << i << " = fields.inputs[" << i << "];\n";
1709*0183ed61SJeremy L Thompson     }
1710*0183ed61SJeremy L Thompson   }
1711*0183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
1712*0183ed61SJeremy L Thompson     code << tab << "CeedScalar *__restrict__ d_out_" << i << " = fields.outputs[" << i << "];\n";
1713*0183ed61SJeremy L Thompson   }
1714*0183ed61SJeremy L Thompson 
1715*0183ed61SJeremy L Thompson   code << tab << "const CeedInt max_dim = " << max_dim << ";\n";
1716*0183ed61SJeremy L Thompson   code << tab << "const CeedInt Q_1d = " << Q_1d << ";\n";
1717*0183ed61SJeremy L Thompson   code << tab << "const CeedInt max_num_points = " << max_num_points << ";\n";
1718*0183ed61SJeremy L Thompson   code << tab << "const CeedInt coords_comp_stride = " << coords_comp_stride << ";\n";
1719*0183ed61SJeremy L Thompson 
1720*0183ed61SJeremy L Thompson   // Shared data
1721*0183ed61SJeremy L Thompson   code << tab << "extern __shared__ CeedScalar slice[];\n";
1722*0183ed61SJeremy L Thompson   code << tab << "SharedData_Hip data;\n";
1723*0183ed61SJeremy L Thompson   code << tab << "data.t_id_x = threadIdx.x;\n";
1724*0183ed61SJeremy L Thompson   code << tab << "data.t_id_y = threadIdx.y;\n";
1725*0183ed61SJeremy L Thompson   code << tab << "data.t_id_z = threadIdx.z;\n";
1726*0183ed61SJeremy L Thompson   code << tab << "data.t_id   = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
1727*0183ed61SJeremy L Thompson   code << tab << "data.slice  = slice + data.t_id_z*OP_T_1D" << ((!is_all_tensor || max_dim == 1) ? "" : "*OP_T_1D") << ";\n";
1728*0183ed61SJeremy L Thompson 
1729*0183ed61SJeremy L Thompson   // -- Determine input mat reuse
1730*0183ed61SJeremy L Thompson   FieldReuse_Hip input_matrix_reuse[CEED_FIELD_MAX];
1731*0183ed61SJeremy L Thompson 
1732*0183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
1733*0183ed61SJeremy L Thompson     input_matrix_reuse[i].index = -1;
1734*0183ed61SJeremy L Thompson   }
1735*0183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
1736*0183ed61SJeremy L Thompson     CeedEvalMode eval_mode_i;
1737*0183ed61SJeremy L Thompson     CeedBasis    basis_i;
1738*0183ed61SJeremy L Thompson 
1739*0183ed61SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
1740*0183ed61SJeremy L Thompson     if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
1741*0183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
1742*0183ed61SJeremy L Thompson     for (CeedInt j = 0; (input_matrix_reuse[i].index == -1) && (j < i); j++) {
1743*0183ed61SJeremy L Thompson       CeedEvalMode eval_mode_j;
1744*0183ed61SJeremy L Thompson       CeedBasis    basis_j;
1745*0183ed61SJeremy L Thompson 
1746*0183ed61SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1747*0183ed61SJeremy L Thompson       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1748*0183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1749*0183ed61SJeremy L Thompson       if (basis_i == basis_j) {
1750*0183ed61SJeremy L Thompson         input_matrix_reuse[i].index     = j;
1751*0183ed61SJeremy L Thompson         input_matrix_reuse[i].is_input  = true;
1752*0183ed61SJeremy L Thompson         input_matrix_reuse[i].eval_mode = eval_mode_j;
1753*0183ed61SJeremy L Thompson       }
1754*0183ed61SJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis_j));
1755*0183ed61SJeremy L Thompson     }
1756*0183ed61SJeremy L Thompson     CeedCallBackend(CeedBasisDestroy(&basis_i));
1757*0183ed61SJeremy L Thompson   }
1758*0183ed61SJeremy L Thompson 
1759*0183ed61SJeremy L Thompson   // -- Determine output mat reuse
1760*0183ed61SJeremy L Thompson   FieldReuse_Hip output_matrix_reuse[CEED_FIELD_MAX];
1761*0183ed61SJeremy L Thompson 
1762*0183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
1763*0183ed61SJeremy L Thompson     output_matrix_reuse[i].index = -1;
1764*0183ed61SJeremy L Thompson   }
1765*0183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
1766*0183ed61SJeremy L Thompson     CeedEvalMode eval_mode_i;
1767*0183ed61SJeremy L Thompson     CeedBasis    basis_i;
1768*0183ed61SJeremy L Thompson 
1769*0183ed61SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
1770*0183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
1771*0183ed61SJeremy L Thompson     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < num_input_fields); j++) {
1772*0183ed61SJeremy L Thompson       CeedEvalMode eval_mode_j;
1773*0183ed61SJeremy L Thompson       CeedBasis    basis_j;
1774*0183ed61SJeremy L Thompson 
1775*0183ed61SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
1776*0183ed61SJeremy L Thompson       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1777*0183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
1778*0183ed61SJeremy L Thompson       if (basis_i == basis_j) {
1779*0183ed61SJeremy L Thompson         output_matrix_reuse[i].index     = j;
1780*0183ed61SJeremy L Thompson         output_matrix_reuse[i].is_input  = true;
1781*0183ed61SJeremy L Thompson         output_matrix_reuse[i].eval_mode = eval_mode_j;
1782*0183ed61SJeremy L Thompson       }
1783*0183ed61SJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis_j));
1784*0183ed61SJeremy L Thompson     }
1785*0183ed61SJeremy L Thompson     for (CeedInt j = 0; (output_matrix_reuse[i].index == -1) && (j < i); j++) {
1786*0183ed61SJeremy L Thompson       CeedEvalMode eval_mode_j;
1787*0183ed61SJeremy L Thompson       CeedBasis    basis_j;
1788*0183ed61SJeremy L Thompson 
1789*0183ed61SJeremy L Thompson       CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
1790*0183ed61SJeremy L Thompson       if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
1791*0183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
1792*0183ed61SJeremy L Thompson       if (basis_i == basis_j) {
1793*0183ed61SJeremy L Thompson         output_matrix_reuse[i].index     = j;
1794*0183ed61SJeremy L Thompson         output_matrix_reuse[i].is_input  = false;
1795*0183ed61SJeremy L Thompson         output_matrix_reuse[i].eval_mode = eval_mode_j;
1796*0183ed61SJeremy L Thompson       }
1797*0183ed61SJeremy L Thompson       CeedCallBackend(CeedBasisDestroy(&basis_j));
1798*0183ed61SJeremy L Thompson     }
1799*0183ed61SJeremy L Thompson     CeedCallBackend(CeedBasisDestroy(&basis_i));
1800*0183ed61SJeremy L Thompson   }
1801*0183ed61SJeremy L Thompson 
1802*0183ed61SJeremy L Thompson   // Initialize constants, and matrices B and G
1803*0183ed61SJeremy L Thompson   code << "\n" << tab << "// Input field constants and basis data\n";
1804*0183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
1805*0183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i],
1806*0183ed61SJeremy L Thompson                                                              max_dim, Q, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1807*0183ed61SJeremy L Thompson   }
1808*0183ed61SJeremy L Thompson   code << "\n" << tab << "// Output field constants and basis data\n";
1809*0183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
1810*0183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelFieldData_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i],
1811*0183ed61SJeremy L Thompson                                                              max_dim, Q, Q_1d, false, is_all_tensor, is_at_points, use_3d_slices));
1812*0183ed61SJeremy L Thompson   }
1813*0183ed61SJeremy L Thompson 
1814*0183ed61SJeremy L Thompson   // Loop over all elements
1815*0183ed61SJeremy L Thompson   code << "\n" << tab << "// Element loop\n";
1816*0183ed61SJeremy L Thompson   code << tab << "__syncthreads();\n";
1817*0183ed61SJeremy L Thompson   code << tab << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";
1818*0183ed61SJeremy L Thompson   tab.push();
1819*0183ed61SJeremy L Thompson 
1820*0183ed61SJeremy L Thompson   // -- Compute minimum buffer space needed
1821*0183ed61SJeremy L Thompson   CeedInt max_rstr_buffer_size = 1;
1822*0183ed61SJeremy L Thompson 
1823*0183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
1824*0183ed61SJeremy L Thompson     CeedEvalMode eval_mode;
1825*0183ed61SJeremy L Thompson 
1826*0183ed61SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
1827*0183ed61SJeremy L Thompson     if (eval_mode != CEED_EVAL_NONE && eval_mode != CEED_EVAL_WEIGHT) {
1828*0183ed61SJeremy L Thompson       CeedInt             num_comp;
1829*0183ed61SJeremy L Thompson       CeedElemRestriction elem_rstr;
1830*0183ed61SJeremy L Thompson 
1831*0183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
1832*0183ed61SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1833*0183ed61SJeremy L Thompson       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1834*0183ed61SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1835*0183ed61SJeremy L Thompson     }
1836*0183ed61SJeremy L Thompson   }
1837*0183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
1838*0183ed61SJeremy L Thompson     CeedEvalMode eval_mode;
1839*0183ed61SJeremy L Thompson 
1840*0183ed61SJeremy L Thompson     CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
1841*0183ed61SJeremy L Thompson     if (eval_mode != CEED_EVAL_NONE) {
1842*0183ed61SJeremy L Thompson       CeedInt             num_comp;
1843*0183ed61SJeremy L Thompson       CeedElemRestriction elem_rstr;
1844*0183ed61SJeremy L Thompson 
1845*0183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
1846*0183ed61SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
1847*0183ed61SJeremy L Thompson       max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * (is_all_tensor && (max_dim >= 3) ? Q_1d : 1));
1848*0183ed61SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
1849*0183ed61SJeremy L Thompson     }
1850*0183ed61SJeremy L Thompson   }
1851*0183ed61SJeremy L Thompson   code << tab << "// Scratch restriction buffer space\n";
1852*0183ed61SJeremy L Thompson   code << tab << "CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";
1853*0183ed61SJeremy L Thompson 
1854*0183ed61SJeremy L Thompson   // -- Determine best input field processing order
1855*0183ed61SJeremy L Thompson   CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];
1856*0183ed61SJeremy L Thompson 
1857*0183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
1858*0183ed61SJeremy L Thompson     field_rstr_in_buffer[i] = -1;
1859*0183ed61SJeremy L Thompson     input_field_order[i]    = -1;
1860*0183ed61SJeremy L Thompson   }
1861*0183ed61SJeremy L Thompson   {
1862*0183ed61SJeremy L Thompson     bool    is_ordered[CEED_FIELD_MAX];
1863*0183ed61SJeremy L Thompson     CeedInt curr_index = 0;
1864*0183ed61SJeremy L Thompson 
1865*0183ed61SJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
1866*0183ed61SJeremy L Thompson     for (CeedInt i = 0; i < num_input_fields; i++) {
1867*0183ed61SJeremy L Thompson       CeedVector          vec_i;
1868*0183ed61SJeremy L Thompson       CeedElemRestriction rstr_i;
1869*0183ed61SJeremy L Thompson 
1870*0183ed61SJeremy L Thompson       if (is_ordered[i]) continue;
1871*0183ed61SJeremy L Thompson       field_rstr_in_buffer[i]       = i;
1872*0183ed61SJeremy L Thompson       is_ordered[i]                 = true;
1873*0183ed61SJeremy L Thompson       input_field_order[curr_index] = i;
1874*0183ed61SJeremy L Thompson       curr_index++;
1875*0183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
1876*0183ed61SJeremy L Thompson       if (vec_i == CEED_VECTOR_NONE) continue;  // CEED_EVAL_WEIGHT
1877*0183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
1878*0183ed61SJeremy L Thompson       for (CeedInt j = i + 1; j < num_input_fields; j++) {
1879*0183ed61SJeremy L Thompson         CeedVector          vec_j;
1880*0183ed61SJeremy L Thompson         CeedElemRestriction rstr_j;
1881*0183ed61SJeremy L Thompson 
1882*0183ed61SJeremy L Thompson         CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
1883*0183ed61SJeremy L Thompson         CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
1884*0183ed61SJeremy L Thompson         if (rstr_i == rstr_j && vec_i == vec_j) {
1885*0183ed61SJeremy L Thompson           field_rstr_in_buffer[j]       = i;
1886*0183ed61SJeremy L Thompson           is_ordered[j]                 = true;
1887*0183ed61SJeremy L Thompson           input_field_order[curr_index] = j;
1888*0183ed61SJeremy L Thompson           curr_index++;
1889*0183ed61SJeremy L Thompson         }
1890*0183ed61SJeremy L Thompson         CeedCallBackend(CeedVectorDestroy(&vec_j));
1891*0183ed61SJeremy L Thompson         CeedCallBackend(CeedElemRestrictionDestroy(&rstr_j));
1892*0183ed61SJeremy L Thompson       }
1893*0183ed61SJeremy L Thompson       CeedCallBackend(CeedVectorDestroy(&vec_i));
1894*0183ed61SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionDestroy(&rstr_i));
1895*0183ed61SJeremy L Thompson     }
1896*0183ed61SJeremy L Thompson   }
1897*0183ed61SJeremy L Thompson 
1898*0183ed61SJeremy L Thompson   // -- Input restriction and basis
1899*0183ed61SJeremy L Thompson   code << "\n" << tab << "// -- Input field restrictions and basis actions\n";
1900*0183ed61SJeremy L Thompson   CeedInt active_field_index = -1;
1901*0183ed61SJeremy L Thompson 
1902*0183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
1903*0183ed61SJeremy L Thompson     bool          is_active = false;
1904*0183ed61SJeremy L Thompson     const char   *field_name;
1905*0183ed61SJeremy L Thompson     const CeedInt f = input_field_order[i];
1906*0183ed61SJeremy L Thompson 
1907*0183ed61SJeremy L Thompson     {
1908*0183ed61SJeremy L Thompson       CeedVector vec;
1909*0183ed61SJeremy L Thompson 
1910*0183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
1911*0183ed61SJeremy L Thompson       is_active = vec == CEED_VECTOR_ACTIVE;
1912*0183ed61SJeremy L Thompson       CeedCallBackend(CeedVectorDestroy(&vec));
1913*0183ed61SJeremy L Thompson     }
1914*0183ed61SJeremy L Thompson 
1915*0183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
1916*0183ed61SJeremy L Thompson     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
1917*0183ed61SJeremy L Thompson 
1918*0183ed61SJeremy L Thompson     if (is_active) {
1919*0183ed61SJeremy L Thompson       std::string var_suffix = "_in_" + std::to_string(f);
1920*0183ed61SJeremy L Thompson 
1921*0183ed61SJeremy L Thompson       code << tab << "// Active field - no restriction or basis action here\n";
1922*0183ed61SJeremy L Thompson       if (active_field_index == -1) {
1923*0183ed61SJeremy L Thompson         active_field_index = f;
1924*0183ed61SJeremy L Thompson         code << tab << "CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << (max_dim >= 3 ? "P_1d" + var_suffix : "1")
1925*0183ed61SJeremy L Thompson              << "] = {0.0};\n";
1926*0183ed61SJeremy L Thompson       } else {
1927*0183ed61SJeremy L Thompson         code << tab << "CeedScalar *r_e" << var_suffix << " = r_e_in_" << active_field_index << ";\n";
1928*0183ed61SJeremy L Thompson       }
1929*0183ed61SJeremy L Thompson     } else {
1930*0183ed61SJeremy L Thompson       // ---- Restriction
1931*0183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, tab, f, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
1932*0183ed61SJeremy L Thompson                                                                  max_dim, Q_1d, true, is_all_tensor, is_at_points, use_3d_slices));
1933*0183ed61SJeremy L Thompson 
1934*0183ed61SJeremy L Thompson       // ---- Basis action
1935*0183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
1936*0183ed61SJeremy L Thompson                                                            is_all_tensor, is_at_points, use_3d_slices));
1937*0183ed61SJeremy L Thompson     }
1938*0183ed61SJeremy L Thompson   }
1939*0183ed61SJeremy L Thompson 
1940*0183ed61SJeremy L Thompson   // -- Loop over active field
1941*0183ed61SJeremy L Thompson   std::string active_var_suffix = "_in_" + std::to_string(active_field_index);
1942*0183ed61SJeremy L Thompson 
1943*0183ed61SJeremy L Thompson   code << "\n" << tab << "// Loop over nodes in active field\n";
1944*0183ed61SJeremy L Thompson   code << tab << "for (CeedInt n = 0; n < num_comp" << active_var_suffix << "*P_1d" << active_var_suffix
1945*0183ed61SJeremy L Thompson        << (max_dim > 1 ? "*P_1d" + active_var_suffix : "") << (max_dim > 2 ? "*P_1d" + active_var_suffix : "") << "; n++) {\n";
1946*0183ed61SJeremy L Thompson   tab.push();
1947*0183ed61SJeremy L Thompson 
1948*0183ed61SJeremy L Thompson   // -- Set current active node and component to 1
1949*0183ed61SJeremy L Thompson   code << tab << "// Set current active node and component to 1.0\n";
1950*0183ed61SJeremy L Thompson   code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 1.0, r_e"
1951*0183ed61SJeremy L Thompson        << active_var_suffix << ");\n\n";
1952*0183ed61SJeremy L Thompson 
1953*0183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_input_fields; i++) {
1954*0183ed61SJeremy L Thompson     bool          is_active = false;
1955*0183ed61SJeremy L Thompson     const char   *field_name;
1956*0183ed61SJeremy L Thompson     const CeedInt f = input_field_order[i];
1957*0183ed61SJeremy L Thompson 
1958*0183ed61SJeremy L Thompson     {
1959*0183ed61SJeremy L Thompson       CeedVector vec;
1960*0183ed61SJeremy L Thompson 
1961*0183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[f], &vec));
1962*0183ed61SJeremy L Thompson       is_active = vec == CEED_VECTOR_ACTIVE;
1963*0183ed61SJeremy L Thompson       CeedCallBackend(CeedVectorDestroy(&vec));
1964*0183ed61SJeremy L Thompson     }
1965*0183ed61SJeremy L Thompson     if (!is_active) continue;
1966*0183ed61SJeremy L Thompson 
1967*0183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetName(op_input_fields[f], &field_name));
1968*0183ed61SJeremy L Thompson     code << tab << "// ---- Input field " << f << ": " << field_name << "\n";
1969*0183ed61SJeremy L Thompson 
1970*0183ed61SJeremy L Thompson     // ---- Basis action
1971*0183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, f, op_input_fields[f], qf_input_fields[f], max_dim, Q_1d, true,
1972*0183ed61SJeremy L Thompson                                                          is_all_tensor, is_at_points, use_3d_slices));
1973*0183ed61SJeremy L Thompson   }
1974*0183ed61SJeremy L Thompson 
1975*0183ed61SJeremy L Thompson   // -- Q function
1976*0183ed61SJeremy L Thompson   CeedCallBackend(CeedOperatorBuildKernelQFunction_Hip_gen(code, data, tab, max_dim, max_num_points, num_input_fields, op_input_fields,
1977*0183ed61SJeremy L Thompson                                                            qf_input_fields, num_output_fields, op_output_fields, qf_output_fields, qfunction_name,
1978*0183ed61SJeremy L Thompson                                                            Q_1d, is_all_tensor, is_at_points, use_3d_slices));
1979*0183ed61SJeremy L Thompson 
1980*0183ed61SJeremy L Thompson   // -- Output basis and restriction
1981*0183ed61SJeremy L Thompson   code << "\n" << tab << "// -- Output field basis action and restrictions\n";
1982*0183ed61SJeremy L Thompson   for (CeedInt i = 0; i < num_output_fields; i++) {
1983*0183ed61SJeremy L Thompson     bool        is_active = false;
1984*0183ed61SJeremy L Thompson     const char *field_name;
1985*0183ed61SJeremy L Thompson 
1986*0183ed61SJeremy L Thompson     {
1987*0183ed61SJeremy L Thompson       CeedVector vec;
1988*0183ed61SJeremy L Thompson 
1989*0183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetVector(op_output_fields[i], &vec));
1990*0183ed61SJeremy L Thompson       is_active = vec == CEED_VECTOR_ACTIVE;
1991*0183ed61SJeremy L Thompson       CeedCallBackend(CeedVectorDestroy(&vec));
1992*0183ed61SJeremy L Thompson     }
1993*0183ed61SJeremy L Thompson     if (!is_active) continue;
1994*0183ed61SJeremy L Thompson 
1995*0183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorFieldGetName(op_output_fields[i], &field_name));
1996*0183ed61SJeremy L Thompson     code << tab << "// ---- Output field " << i << ": " << field_name << "\n";
1997*0183ed61SJeremy L Thompson 
1998*0183ed61SJeremy L Thompson     // ---- Basis action
1999*0183ed61SJeremy L Thompson     CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, tab, i, op_output_fields[i], qf_output_fields[i], max_dim, Q_1d, false,
2000*0183ed61SJeremy L Thompson                                                          is_all_tensor, is_at_points, use_3d_slices));
2001*0183ed61SJeremy L Thompson 
2002*0183ed61SJeremy L Thompson     // ---- Restriction
2003*0183ed61SJeremy L Thompson     if (is_full) {
2004*0183ed61SJeremy L Thompson       // TODO: UPDATE OUTPUTS FOR FULL ASSEMBLY
2005*0183ed61SJeremy L Thompson     } else {
2006*0183ed61SJeremy L Thompson       std::string         var_suffix = "_out_" + std::to_string(i);
2007*0183ed61SJeremy L Thompson       CeedInt             comp_stride;
2008*0183ed61SJeremy L Thompson       CeedSize            l_size;
2009*0183ed61SJeremy L Thompson       CeedElemRestriction elem_rstr;
2010*0183ed61SJeremy L Thompson 
2011*0183ed61SJeremy L Thompson       CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
2012*0183ed61SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size));
2013*0183ed61SJeremy L Thompson       code << tab << "const CeedInt l_size" << var_suffix << " = " << l_size << ";\n";
2014*0183ed61SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride));
2015*0183ed61SJeremy L Thompson       code << tab << "const CeedInt comp_stride" << var_suffix << " = " << comp_stride << ";\n";
2016*0183ed61SJeremy L Thompson       code << tab << "WriteLVecStandard" << max_dim << "d_Single<num_comp" << var_suffix << ", comp_stride" << var_suffix << ", P_1d" + var_suffix
2017*0183ed61SJeremy L Thompson            << ">(data, l_size" << var_suffix << ", elem, n, indices.outputs[" << i << "], r_e" << var_suffix << ", values_array);\n";
2018*0183ed61SJeremy L Thompson       CeedCallBackend(CeedElemRestrictionDestroy(&elem_rstr));
2019*0183ed61SJeremy L Thompson     }
2020*0183ed61SJeremy L Thompson   }
2021*0183ed61SJeremy L Thompson 
2022*0183ed61SJeremy L Thompson   // -- Reset current active node and component
2023*0183ed61SJeremy L Thompson   code << "\n" << tab << "// Reset current active node and component to 0.0\n";
2024*0183ed61SJeremy L Thompson   code << tab << "SetEVecStandard" << max_dim << "d_Single<num_comp" << active_var_suffix << ", P_1d" << active_var_suffix << ">(data, n, 0.0, r_e"
2025*0183ed61SJeremy L Thompson        << active_var_suffix << ");\n";
2026*0183ed61SJeremy L Thompson 
2027*0183ed61SJeremy L Thompson   // -- End of loop over active field
2028*0183ed61SJeremy L Thompson   tab.pop();
2029*0183ed61SJeremy L Thompson   code << tab << "}\n";
2030*0183ed61SJeremy L Thompson 
2031*0183ed61SJeremy L Thompson   // Close loop and function
2032*0183ed61SJeremy L Thompson   tab.pop();
2033*0183ed61SJeremy L Thompson   code << tab << "}\n";
2034*0183ed61SJeremy L Thompson   tab.pop();
2035*0183ed61SJeremy L Thompson   code << tab << "}\n";
2036*0183ed61SJeremy L Thompson   code << tab << "// -----------------------------------------------------------------------------\n\n";
2037*0183ed61SJeremy L Thompson 
2038*0183ed61SJeremy L Thompson   CeedInt block_sizes[3] = {0, 0, 0};
2039*0183ed61SJeremy L Thompson   CeedInt num_elem;
2040*0183ed61SJeremy L Thompson 
2041*0183ed61SJeremy L Thompson   // Compile
2042*0183ed61SJeremy L Thompson   CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem));
2043*0183ed61SJeremy L Thompson   CeedCallBackend(BlockGridCalculate_Hip_gen(max_dim, num_elem, data->max_P_1d, Q_1d, block_sizes));
2044*0183ed61SJeremy L Thompson   block_sizes[2] = 1;
2045*0183ed61SJeremy L Thompson   {
2046*0183ed61SJeremy L Thompson     bool is_compile_good = false;
2047*0183ed61SJeremy L Thompson 
2048*0183ed61SJeremy L Thompson     data->thread_1d = block_sizes[0];
2049*0183ed61SJeremy L Thompson     CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good,
2050*0183ed61SJeremy L Thompson                                        is_full ? &data->module_assemble_full : &data->module_assemble_diagonal, 2, "OP_T_1D", block_sizes[0],
2051*0183ed61SJeremy L Thompson                                        "BLOCK_SIZE", block_sizes[0] * block_sizes[1] * block_sizes[2]));
2052*0183ed61SJeremy L Thompson     if (is_compile_good) {
2053*0183ed61SJeremy L Thompson       *is_good_build = true;
2054*0183ed61SJeremy L Thompson       CeedCallBackend(CeedGetKernel_Hip(ceed, is_full ? data->module_assemble_full : data->module_assemble_diagonal, operator_name.c_str(),
2055*0183ed61SJeremy L Thompson                                         is_full ? &data->assemble_full : &data->assemble_diagonal));
2056*0183ed61SJeremy L Thompson     } else {
2057*0183ed61SJeremy L Thompson       *is_good_build              = false;
2058*0183ed61SJeremy L Thompson       data->use_assembly_fallback = true;
2059*0183ed61SJeremy L Thompson     }
2060*0183ed61SJeremy L Thompson   }
2061*0183ed61SJeremy L Thompson   CeedCallBackend(CeedDestroy(&ceed));
2062*0183ed61SJeremy L Thompson   CeedCallBackend(CeedQFunctionDestroy(&qf));
2063*0183ed61SJeremy L Thompson   return CEED_ERROR_SUCCESS;
2064*0183ed61SJeremy L Thompson }
2065*0183ed61SJeremy L Thompson 
2066*0183ed61SJeremy L Thompson extern "C" int CeedOperatorBuildKernelDiagonalAssemblyAtPoints_Hip_gen(CeedOperator op, bool *is_good_build) {
2067*0183ed61SJeremy L Thompson   return CeedOperatorBuildKernelAssemblyAtPoints_Hip_gen(op, false, is_good_build);
2068*0183ed61SJeremy L Thompson }
2069*0183ed61SJeremy L Thompson 
2070*0183ed61SJeremy L Thompson //------------------------------------------------------------------------------
2071