backends/cuda-gen/ceed-cuda-gen-operator-build.cpp

*241a4b83SYohann// Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
*241a4b83SYohann// Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
*241a4b83SYohann// All Rights reserved. See files LICENSE and NOTICE for details.
*241a4b83SYohann//
*241a4b83SYohann// This file is part of CEED, a collection of benchmarks, miniapps, software
*241a4b83SYohann// libraries and APIs for efficient high-order finite element and spectral
*241a4b83SYohann// element discretizations for exascale applications. For more information and
*241a4b83SYohann// source code availability see http://github.com/ceed.
*241a4b83SYohann//
*241a4b83SYohann// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
*241a4b83SYohann// a collaborative effort of two U.S. Department of Energy organizations (Office
*241a4b83SYohann// of Science and the National Nuclear Security Administration) responsible for
*241a4b83SYohann// the planning and preparation of a capable exascale ecosystem, including
*241a4b83SYohann// software, applications, hardware, advanced system engineering and early
*241a4b83SYohann// testbed platforms, in support of the nation's exascale computing imperative.
*241a4b83SYohann#include <ceed-backend.h>
*241a4b83SYohann#include "ceed-cuda-gen.h"
*241a4b83SYohann#include <iostream>
*241a4b83SYohann#include <sstream>
*241a4b83SYohann#include "../cuda/ceed-cuda.h"
*241a4b83SYohann#include "../cuda-reg/ceed-cuda-reg.h"
*241a4b83SYohann#include "../cuda-shared/ceed-cuda-shared.h"
*241a4b83SYohann
*241a4b83SYohannstatic const char *deviceFunctions = QUOTE(
*241a4b83SYohann
// Bundle of scalar array pointers: 16 read-only input slots and 16 writable output slots.
typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
// Same 16 + 16 slot layout for integer arrays.
// NOTE(review): presumably carries the per-field index arrays that the helpers below
// receive as `indices` -- confirm against the kernel generator.
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;
*241a4b83SYohann
// Per-thread bookkeeping passed by reference to every helper below.
typedef struct {
  CeedInt tidx;       // x index of the thread: compared against P1d and used to index slice and the basis matrices
  CeedInt tidy;       // y index of the thread, used by the 2D and 3D helpers
  CeedInt tidz;       // not referenced by the helpers visible in this part of the file
  CeedInt tid;        // flat thread index, used by loadMatrix as the start of its strided copy
  CeedScalar* slice;  // scratch buffer: each thread writes one entry, others read it after __syncthreads()
                      // NOTE(review): presumably points into shared memory -- confirm where it is set up
} BackendData;
*241a4b83SYohann
// Software atomicAdd for double, built from a 64-bit atomicCAS retry loop.
// Guarded by __CUDA_ARCH__ < 600 because a native atomicAdd on double requires SM60+.
// Adds val to *address atomically and returns the value that was stored there before the addition.
// NOTE(review): these #if/#endif lines sit inside the QUOTE(...) macro argument, so they are
// presumably evaluated by the host preprocessor rather than by the runtime compiler -- confirm
// that the guard has the intended effect when targeting SM60+ devices.
#if __CUDA_ARCH__ < 600
__device__ double atomicAdd(double *address, double val) {
  // Reinterpret the double as a 64-bit integer so that atomicCAS can operate on it.
  unsigned long long int *address_as_ull = (unsigned long long int *)address;
  unsigned long long int old = *address_as_ull, assumed;
  do {
    // Snapshot the last observed value, then try to swap in snapshot + val.
    assumed = old;
    old =
      atomicCAS(address_as_ull, assumed,
                __double_as_longlong(val +
                                     __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN
    // (since NaN != NaN)
  } while (assumed != old); // retry if another thread changed the value in between
  return __longlong_as_double(old);
}
#endif // __CUDA_ARCH__ < 600
*241a4b83SYohann
// Copies the P*Q entries of d_B into B.
// Each thread starts at data.tid and advances by the total number of threads in the block,
// so the block as a whole covers every entry once.
// A block-wide barrier follows the copy, so B is complete for every thread on return.
// Must be reached by all threads of the block, because it contains __syncthreads().
// NOTE(review): B is presumably a shared-memory buffer of at least P*Q entries -- confirm at the call site.
template <int P, int Q>
inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B, CeedScalar* B) {
  for(int i=data.tid; i<P*Q; i+=blockDim.x*blockDim.y*blockDim.z) {
    B[i] = d_B[i];
  }
  // The call parentheses are required: a bare `__syncthreads;` only names the function
  // and emits no barrier at all.
  __syncthreads();
}
*241a4b83SYohann
*241a4b83SYohann//****
*241a4b83SYohann// 1D
// Gathers the NCOMP components of one 1D node from d_u into r_u.
// Threads with data.tidx >= P1d do nothing.
// The node number is indices[data.tidx + elem*P1d], or that offset itself when indices is null.
// Component c is read ndofs*c entries after the node number.
template <int NCOMP, int P1d>
inline __device__ void readDofs1d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx >= P1d) return;
  const CeedInt offset = data.tidx + elem * P1d;
  const CeedInt node = indices ? indices[offset] : offset;
  for (CeedInt c = 0; c < NCOMP; c++) {
    r_u[c] = d_u[node + ndofs * c];
  }
}
*241a4b83SYohann
// Gathers the NCOMP components of one 1D node from d_u into r_u, with the components
// of a node stored next to each other: component c is read from d_u[node*NCOMP + c].
// Threads with data.tidx >= P1d do nothing.
// The node number is indices[data.tidx + elem*P1d], or that offset itself when indices is null.
// ndofs is not used by this layout.
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose1d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx >= P1d) return;
  const CeedInt offset = data.tidx + elem * P1d;
  const CeedInt node = indices ? indices[offset] : offset;
  for (CeedInt c = 0; c < NCOMP; c++) {
    r_u[c] = d_u[node * NCOMP + c];
  }
}
*241a4b83SYohann
// Loads the NCOMP values of point data.tidx of element elem from d_u into r_u.
// Component c is read nquads*c entries after the point offset data.tidx + elem*Q1d.
// No bound check is made on data.tidx.
template <int NCOMP, int Q1d>
inline __device__ void readQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt point = data.tidx + elem * Q1d;
  for (CeedInt c = 0; c < NCOMP; c++) {
    r_u[c] = d_u[point + nquads * c];
  }
}
*241a4b83SYohann
// Loads the NCOMP values of point data.tidx of element elem from d_u into r_u, with the
// components of a point stored next to each other: component c is read from d_u[point*NCOMP + c].
// nquads is not used by this layout. No bound check is made on data.tidx.
template <int NCOMP, int Q1d>
inline __device__ void readQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt point = data.tidx + elem * Q1d;
  for (CeedInt c = 0; c < NCOMP; c++) {
    r_u[c] = d_u[point * NCOMP + c];
  }
}
*241a4b83SYohann
// Accumulates the NCOMP values in r_v into d_v for one 1D node, using atomicAdd.
// Threads with data.tidx >= P1d do nothing.
// The node number is indices[data.tidx + elem*P1d], or that offset itself when indices is null.
// Component c is added ndofs*c entries after the node number.
template <int NCOMP, int P1d>
inline __device__ void writeDofs1d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx >= P1d) return;
  const CeedInt offset = data.tidx + elem * P1d;
  const CeedInt node = indices ? indices[offset] : offset;
  for (CeedInt c = 0; c < NCOMP; c++) {
    atomicAdd(&d_v[node + ndofs * c], r_v[c]);
  }
}
*241a4b83SYohann
// Accumulates the NCOMP values in r_v into d_v for one 1D node, using atomicAdd, with the
// components of a node stored next to each other: component c goes to d_v[node*NCOMP + c].
// Threads with data.tidx >= P1d do nothing.
// The node number is indices[data.tidx + elem*P1d], or that offset itself when indices is null.
// ndofs is not used by this layout.
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose1d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx >= P1d) return;
  const CeedInt offset = data.tidx + elem * P1d;
  const CeedInt node = indices ? indices[offset] : offset;
  for (CeedInt c = 0; c < NCOMP; c++) {
    atomicAdd(&d_v[node * NCOMP + c], r_v[c]);
  }
}
*241a4b83SYohann
// Stores the NCOMP values in r_v to d_v for point data.tidx of element elem.
// Component c is written nquads*c entries after the point offset data.tidx + elem*Q1d.
// Plain stores are used, not atomics. No bound check is made on data.tidx.
template <int NCOMP, int Q1d>
inline __device__ void writeQuads1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt point = data.tidx + elem * Q1d;
  for (CeedInt c = 0; c < NCOMP; c++) {
    d_v[point + nquads * c] = r_v[c];
  }
}
*241a4b83SYohann
// Stores the NCOMP values in r_v to d_v for point data.tidx of element elem, with the
// components of a point stored next to each other: component c goes to d_v[point*NCOMP + c].
// nquads is not used by this layout. Plain stores are used, not atomics.
template <int NCOMP, int Q1d>
inline __device__ void writeQuadsTranspose1d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt point = data.tidx + elem * Q1d;
  for (CeedInt c = 0; c < NCOMP; c++) {
    d_v[point * NCOMP + c] = r_v[c];
  }
}
*241a4b83SYohann
// 1D contraction: *V = sum over i in [0,P1d) of B[i + data.tidx*P1d] * (value *U held by thread i).
// Each thread publishes its *U in data.slice[data.tidx]. A barrier separates the writes from the
// reads, and a second barrier protects data.slice before its next reuse.
// Must be reached by all threads of the block.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  const CeedScalar *row = B + data.tidx*P1d;
  CeedScalar sum = 0.0;
  for (int p = 0; p < P1d; p++) {
    sum += row[p] * data.slice[p]; // contract x direction
  }
  *V = sum;
  __syncthreads();
}
*241a4b83SYohann
// Transposed 1D contraction: *V = sum over i in [0,Q1d) of B[data.tidx + i*P1d] * (value *U held by thread i).
// Each thread publishes its *U in data.slice[data.tidx]. A barrier separates the writes from the
// reads, and a second barrier protects data.slice before its next reuse.
// Only threads with data.tidx < P1d accumulate; the others leave *V at 0.0. This matches the
// guards of the 2D and 3D transpose contractions. Without it, a thread with data.tidx >= P1d
// would read B[data.tidx + (Q1d-1)*P1d], which lies at or beyond index P1d*Q1d.
// Must be reached by all threads of the block; the barriers stay outside the guard.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx<P1d) {
    for (int i = 0; i < Q1d; ++i) {
      *V += B[data.tidx + i*P1d] * data.slice[i];//contract x direction
    }
  }
  __syncthreads();
}
*241a4b83SYohann
// Applies ContractX1d with matrix c_B to each of the NCOMP entries of r_U, writing r_V[c] from r_U[c].
// Must be reached by all threads of the block, since ContractX1d synchronizes.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  for (int c = 0; c < NCOMP; ++c) {
    ContractX1d<NCOMP,P1d,Q1d>(data, &r_U[c], c_B, &r_V[c]);
  }
}
*241a4b83SYohann
// Applies ContractTransposeX1d with matrix c_B to each of the NCOMP entries of r_U,
// writing r_V[c] from r_U[c].
// Must be reached by all threads of the block, since the contraction synchronizes.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  for (int c = 0; c < NCOMP; ++c) {
    ContractTransposeX1d<NCOMP,P1d,Q1d>(data, &r_U[c], c_B, &r_V[c]);
  }
}
*241a4b83SYohann
// Applies ContractX1d with matrix c_G to each of the NCOMP entries of r_U, writing r_V[c] from r_U[c].
// c_B is accepted for a uniform signature but is not used in 1D.
// Must be reached by all threads of the block, since ContractX1d synchronizes.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad1d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  for (int c = 0; c < NCOMP; ++c) {
    ContractX1d<NCOMP,P1d,Q1d>(data, &r_U[c], c_G, &r_V[c]);
  }
}
*241a4b83SYohann
// Applies ContractTransposeX1d with matrix c_G to each of the NCOMP entries of r_U,
// writing r_V[c] from r_U[c].
// c_B is accepted for a uniform signature but is not used in 1D.
// Must be reached by all threads of the block, since the contraction synchronizes.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose1d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  for (int c = 0; c < NCOMP; ++c) {
    ContractTransposeX1d<NCOMP,P1d,Q1d>(data, &r_U[c], c_G, &r_V[c]);
  }
}
*241a4b83SYohann
*241a4b83SYohann//****
*241a4b83SYohann// 2D
// Gathers the NCOMP components of one 2D node from d_u into r_u.
// Only threads with data.tidx < P1d and data.tidy < P1d take part.
// The local node is data.tidx + data.tidy*P1d. Its global number is looked up in indices at
// local + elem*P1d*P1d, or is that offset itself when indices is null.
// Component c is read ndofs*c entries after the node number.
template <int NCOMP, int P1d>
inline __device__ void readDofs2d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx >= P1d || data.tidy >= P1d) return;
  const CeedInt local = data.tidx + data.tidy*P1d;
  const CeedInt offset = local + elem * P1d*P1d;
  const CeedInt node = indices ? indices[offset] : offset;
  for (CeedInt c = 0; c < NCOMP; c++) {
    r_u[c] = d_u[node + ndofs * c];
  }
}
*241a4b83SYohann
// Gathers the NCOMP components of one 2D node from d_u into r_u, with the components
// of a node stored next to each other: component c is read from d_u[node*NCOMP + c].
// Only threads with data.tidx < P1d and data.tidy < P1d take part.
// The local node is data.tidx + data.tidy*P1d. Its global number is looked up in indices at
// local + elem*P1d*P1d, or is that offset itself when indices is null.
// ndofs is not used by this layout.
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose2d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx >= P1d || data.tidy >= P1d) return;
  const CeedInt local = data.tidx + data.tidy*P1d;
  const CeedInt offset = local + elem * P1d*P1d;
  const CeedInt node = indices ? indices[offset] : offset;
  for (CeedInt c = 0; c < NCOMP; c++) {
    r_u[c] = d_u[node * NCOMP + c];
  }
}
*241a4b83SYohann
// Loads the NCOMP values of one 2D point of element elem from d_u into r_u.
// The point offset is data.tidx + data.tidy*Q1d + elem*Q1d*Q1d.
// Component c is read nquads*c entries after it. No bound check is made on the thread indices.
template <int NCOMP, int Q1d>
inline __device__ void readQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt local = data.tidx + data.tidy*Q1d;
  const CeedInt point = local + elem * Q1d*Q1d;
  for (CeedInt c = 0; c < NCOMP; c++) {
    r_u[c] = d_u[point + nquads * c];
  }
}
*241a4b83SYohann
// Loads the NCOMP values of one 2D point of element elem from d_u into r_u, with the
// components of a point stored next to each other: component c is read from d_u[point*NCOMP + c].
// The point offset is data.tidx + data.tidy*Q1d + elem*Q1d*Q1d. nquads is not used by this layout.
template <int NCOMP, int Q1d>
inline __device__ void readQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  const CeedInt local = data.tidx + data.tidy*Q1d;
  const CeedInt point = local + elem * Q1d*Q1d;
  for (CeedInt c = 0; c < NCOMP; c++) {
    r_u[c] = d_u[point * NCOMP + c];
  }
}
*241a4b83SYohann
// Accumulates the NCOMP values in r_v into d_v for one 2D node, using atomicAdd.
// Only threads with data.tidx < P1d and data.tidy < P1d take part.
// The local node is data.tidx + data.tidy*P1d. Its global number is looked up in indices at
// local + elem*P1d*P1d, or is that offset itself when indices is null.
// Component c is added ndofs*c entries after the node number.
template <int NCOMP, int P1d>
inline __device__ void writeDofs2d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx >= P1d || data.tidy >= P1d) return;
  const CeedInt local = data.tidx + data.tidy*P1d;
  const CeedInt offset = local + elem * P1d*P1d;
  const CeedInt node = indices ? indices[offset] : offset;
  for (CeedInt c = 0; c < NCOMP; c++) {
    atomicAdd(&d_v[node + ndofs * c], r_v[c]);
  }
}
*241a4b83SYohann
// Accumulates the NCOMP values in r_v into d_v for one 2D node, using atomicAdd, with the
// components of a node stored next to each other: component c goes to d_v[node*NCOMP + c].
// Only threads with data.tidx < P1d and data.tidy < P1d take part.
// The local node is data.tidx + data.tidy*P1d. Its global number is looked up in indices at
// local + elem*P1d*P1d, or is that offset itself when indices is null.
// ndofs is not used by this layout.
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose2d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx >= P1d || data.tidy >= P1d) return;
  const CeedInt local = data.tidx + data.tidy*P1d;
  const CeedInt offset = local + elem * P1d*P1d;
  const CeedInt node = indices ? indices[offset] : offset;
  for (CeedInt c = 0; c < NCOMP; c++) {
    atomicAdd(&d_v[node * NCOMP + c], r_v[c]);
  }
}
*241a4b83SYohann
// Stores the NCOMP values in r_v to d_v for one 2D point of element elem.
// The point offset is data.tidx + data.tidy*Q1d + elem*Q1d*Q1d.
// Component c is written nquads*c entries after it. Plain stores are used, not atomics.
template <int NCOMP, int Q1d>
inline __device__ void writeQuads2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt local = data.tidx + data.tidy*Q1d;
  const CeedInt point = local + elem * Q1d*Q1d;
  for (CeedInt c = 0; c < NCOMP; c++) {
    d_v[point + nquads * c] = r_v[c];
  }
}
*241a4b83SYohann
// Stores the NCOMP values in r_v to d_v for one 2D point of element elem, with the
// components of a point stored next to each other: component c goes to d_v[point*NCOMP + c].
// The point offset is data.tidx + data.tidy*Q1d + elem*Q1d*Q1d. nquads is not used by this layout.
template <int NCOMP, int Q1d>
inline __device__ void writeQuadsTranspose2d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  const CeedInt local = data.tidx + data.tidy*Q1d;
  const CeedInt point = local + elem * Q1d*Q1d;
  for (CeedInt c = 0; c < NCOMP; c++) {
    d_v[point * NCOMP + c] = r_v[c];
  }
}
*241a4b83SYohann
// 2D contraction along x:
//   *V = sum over i in [0,P1d) of B[i + data.tidx*P1d] * (value *U held by thread (i, data.tidy)).
// Each thread publishes its *U in data.slice[data.tidx + data.tidy*Q1d]. A barrier separates the
// writes from the reads, and a second barrier protects data.slice before its next reuse.
// Must be reached by all threads of the block.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  const CeedScalar *row = B + data.tidx*P1d;
  const CeedScalar *line = data.slice + data.tidy*Q1d;
  CeedScalar sum = 0.0;
  for (int p = 0; p < P1d; p++) {
    sum += row[p] * line[p]; // contract x direction
  }
  *V = sum;
  __syncthreads();
}
*241a4b83SYohann
// 2D contraction along y:
//   *V = sum over i in [0,P1d) of B[i + data.tidy*P1d] * (value *U held by thread (data.tidx, i)).
// Each thread publishes its *U in data.slice[data.tidx + data.tidy*Q1d]. A barrier separates the
// writes from the reads, and a second barrier protects data.slice before its next reuse.
// Must be reached by all threads of the block.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  const CeedScalar *row = B + data.tidy*P1d;
  CeedScalar sum = 0.0;
  for (int p = 0; p < P1d; p++) {
    sum += row[p] * data.slice[data.tidx + p*Q1d]; // contract y direction
  }
  *V = sum;
  __syncthreads();
}
*241a4b83SYohann
// Transposed 2D contraction along y:
//   *V = sum over i in [0,Q1d) of B[data.tidy + i*P1d] * (value *U held by thread (data.tidx, i)).
// Threads with data.tidy >= P1d skip the sum and get *V = 0.0.
// The two barriers stay outside the guard so that every thread of the block reaches them.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (data.tidy < P1d) {
    for (int q = 0; q < Q1d; q++) {
      sum += B[data.tidy + q*P1d] * data.slice[data.tidx + q*Q1d]; // contract y direction
    }
  }
  *V = sum;
  __syncthreads();
}
*241a4b83SYohann
// Transposed 2D contraction along x:
//   *V = sum over i in [0,Q1d) of B[data.tidx + i*P1d] * (value *U held by thread (i, data.tidy)).
// Threads with data.tidx >= P1d skip the sum and get *V = 0.0.
// The two barriers stay outside the guard so that every thread of the block reaches them.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (data.tidx < P1d) {
    const CeedScalar *line = data.slice + data.tidy*Q1d;
    for (int q = 0; q < Q1d; q++) {
      sum += B[data.tidx + q*P1d] * line[q]; // contract x direction
    }
  }
  *V = sum;
  __syncthreads();
}
*241a4b83SYohann
// Same contraction as ContractXTranspose2d, but the terms are added onto the value already
// stored in *V instead of starting from zero.
// Threads with data.tidx >= P1d leave *V untouched.
// The terms are accumulated directly into *V, one at a time, in increasing i.
// The two barriers stay outside the guard so that every thread of the block reaches them.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData& data,
                                            const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  const bool active = data.tidx < P1d;
  if (active) {
    const CeedScalar *line = data.slice + data.tidy*Q1d;
    for (int q = 0; q < Q1d; q++) {
      *V += B[data.tidx + q*P1d] * line[q]; // contract x direction
    }
  }
  __syncthreads();
}
*241a4b83SYohann
// For each of the NCOMP entries of r_U: contract along x with c_B into a temporary, then
// contract the temporary along y with c_B into r_V[c].
// Must be reached by all threads of the block, since the contractions synchronize.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar tmp;
  for (int c = 0; c < NCOMP; ++c) {
    ContractX2d<NCOMP,P1d,Q1d>(data, &r_U[c], c_B, &tmp);
    ContractY2d<NCOMP,P1d,Q1d>(data, &tmp, c_B, &r_V[c]);
  }
}
*241a4b83SYohann
// For each of the NCOMP entries of r_U: apply the transposed y contraction with c_B into a
// temporary, then the transposed x contraction with c_B into r_V[c].
// Must be reached by all threads of the block, since the contractions synchronize.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar tmp;
  for (int c = 0; c < NCOMP; ++c) {
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, &r_U[c], c_B, &tmp);
    ContractXTranspose2d<NCOMP,P1d,Q1d>(data, &tmp, c_B, &r_V[c]);
  }
}
*241a4b83SYohann
// For each of the NCOMP entries of r_U, produces two outputs:
//   r_V[c]         : x contraction with c_G followed by y contraction with c_B
//   r_V[c + NCOMP] : x contraction with c_B followed by y contraction with c_G
// Must be reached by all threads of the block, since the contractions synchronize.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad2d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar tmp;
  CeedScalar *outX = r_V;          // first block of NCOMP outputs
  CeedScalar *outY = r_V + NCOMP;  // second block of NCOMP outputs
  for (int c = 0; c < NCOMP; ++c) {
    ContractX2d<NCOMP,P1d,Q1d>(data, &r_U[c], c_G, &tmp);
    ContractY2d<NCOMP,P1d,Q1d>(data, &tmp, c_B, &outX[c]);
    ContractX2d<NCOMP,P1d,Q1d>(data, &r_U[c], c_B, &tmp);
    ContractY2d<NCOMP,P1d,Q1d>(data, &tmp, c_G, &outY[c]);
  }
}
*241a4b83SYohann
// Transposed counterpart of the 2D gradient. For each c in [0,NCOMP):
//   r_V[c]  = transposed-x(c_G) of transposed-y(c_B) applied to r_U[c]
//   r_V[c] += transposed-x(c_B) of transposed-y(c_G) applied to r_U[c + NCOMP]
// Must be reached by all threads of the block, since the contractions synchronize.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose2d(BackendData& data, const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar tmp;
  const CeedScalar *inX = r_U;          // first block of NCOMP inputs
  const CeedScalar *inY = r_U + NCOMP;  // second block of NCOMP inputs
  for (int c = 0; c < NCOMP; ++c) {
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, &inX[c], c_B, &tmp);
    ContractXTranspose2d<NCOMP,P1d,Q1d>(data, &tmp, c_G, &r_V[c]);
    ContractYTranspose2d<NCOMP,P1d,Q1d>(data, &inY[c], c_G, &tmp);
    ContractXTransposeAdd2d<NCOMP,P1d,Q1d>(data, &tmp, c_B, &r_V[c]);
  }
}
*241a4b83SYohann
*241a4b83SYohann//****
*241a4b83SYohann// 3D
// Gathers a column of P1d nodes, one per z in [0,P1d), with NCOMP components each, into r_u.
// Only threads with data.tidx < P1d and data.tidy < P1d take part.
// The local node is data.tidx + data.tidy*P1d + z*P1d*P1d. Its global number is looked up in
// indices at local + elem*P1d*P1d*P1d, or is that offset itself when indices is null.
// Component c of level z is read from d_u[node + ndofs*c] and stored in r_u[z + c*P1d].
template <int NCOMP, int P1d>
inline __device__ void readDofs3d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx >= P1d || data.tidy >= P1d) return;
  for (CeedInt z = 0; z < P1d; z++) {
    const CeedInt local = data.tidx + data.tidy*P1d + z*P1d*P1d;
    const CeedInt offset = local + elem * P1d*P1d*P1d;
    const CeedInt node = indices ? indices[offset] : offset;
    for (CeedInt c = 0; c < NCOMP; c++) {
      r_u[z+c*P1d] = d_u[node + ndofs * c];
    }
  }
}
*241a4b83SYohann
// Gathers a column of P1d nodes, one per z in [0,P1d), with NCOMP components each, into r_u,
// with the components of a node stored next to each other in d_u.
// Only threads with data.tidx < P1d and data.tidy < P1d take part.
// The local node is data.tidx + data.tidy*P1d + z*P1d*P1d. Its global number is looked up in
// indices at local + elem*P1d*P1d*P1d, or is that offset itself when indices is null.
// Component c of level z is read from d_u[node*NCOMP + c] and stored in r_u[z + c*P1d].
// ndofs is not used by this layout.
template <int NCOMP, int P1d>
inline __device__ void readDofsTranspose3d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.tidx >= P1d || data.tidy >= P1d) return;
  for (CeedInt z = 0; z < P1d; z++) {
    const CeedInt local = data.tidx + data.tidy*P1d + z*P1d*P1d;
    const CeedInt offset = local + elem * P1d*P1d*P1d;
    const CeedInt node = indices ? indices[offset] : offset;
    for (CeedInt c = 0; c < NCOMP; c++) {
      r_u[z+c*P1d] = d_u[node * NCOMP + c];
    }
  }
}
*241a4b83SYohann
// Loads a column of Q1d points, one per z in [0,Q1d), with NCOMP components each, into r_u.
// The point offset is data.tidx + data.tidy*Q1d + z*Q1d*Q1d + elem*Q1d*Q1d*Q1d.
// Component c of level z is read from d_u[point + nquads*c] and stored in r_u[z + c*Q1d].
// No bound check is made on the thread indices.
template <int NCOMP, int Q1d>
inline __device__ void readQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  for (CeedInt z = 0; z < Q1d; z++) {
    const CeedInt local = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
    const CeedInt point = local + elem * Q1d*Q1d*Q1d;
    for (CeedInt c = 0; c < NCOMP; c++) {
      r_u[z+c*Q1d] = d_u[point + nquads * c];
    }
  }
}
*241a4b83SYohann
// Loads a column of Q1d points, one per z in [0,Q1d), with NCOMP components each, into r_u,
// with the components of a point stored next to each other in d_u.
// The point offset is data.tidx + data.tidy*Q1d + z*Q1d*Q1d + elem*Q1d*Q1d*Q1d.
// Component c of level z is read from d_u[point*NCOMP + c] and stored in r_u[z + c*Q1d].
// nquads is not used by this layout.
template <int NCOMP, int Q1d>
inline __device__ void readQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  for (CeedInt z = 0; z < Q1d; z++) {
    const CeedInt local = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
    const CeedInt point = local + elem * Q1d*Q1d*Q1d;
    for (CeedInt c = 0; c < NCOMP; c++) {
      r_u[z+c*Q1d] = d_u[point * NCOMP + c];
    }
  }
}
*241a4b83SYohann
// Accumulates a column of P1d nodes, one per z in [0,P1d), with NCOMP components each, from r_v
// into d_v, using atomicAdd.
// Only threads with data.tidx < P1d and data.tidy < P1d take part.
// The local node is data.tidx + data.tidy*P1d + z*P1d*P1d. Its global number is looked up in
// indices at local + elem*P1d*P1d*P1d, or is that offset itself when indices is null.
// The value r_v[z + c*P1d] is added to d_v[node + ndofs*c].
template <int NCOMP, int P1d>
inline __device__ void writeDofs3d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx >= P1d || data.tidy >= P1d) return;
  for (CeedInt z = 0; z < P1d; z++) {
    const CeedInt local = data.tidx + data.tidy*P1d + z*P1d*P1d;
    const CeedInt offset = local + elem * P1d*P1d*P1d;
    const CeedInt node = indices ? indices[offset] : offset;
    for (CeedInt c = 0; c < NCOMP; c++) {
      atomicAdd(&d_v[node + ndofs * c], r_v[z+c*P1d]);
    }
  }
}
*241a4b83SYohann
// Accumulates a column of P1d nodes, one per z in [0,P1d), with NCOMP components each, from r_v
// into d_v, using atomicAdd, with the components of a node stored next to each other in d_v.
// Only threads with data.tidx < P1d and data.tidy < P1d take part.
// The local node is data.tidx + data.tidy*P1d + z*P1d*P1d. Its global number is looked up in
// indices at local + elem*P1d*P1d*P1d, or is that offset itself when indices is null.
// The value r_v[z + c*P1d] is added to d_v[node*NCOMP + c]. ndofs is not used by this layout.
template <int NCOMP, int P1d>
inline __device__ void writeDofsTranspose3d(BackendData& data, const CeedInt ndofs, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.tidx >= P1d || data.tidy >= P1d) return;
  for (CeedInt z = 0; z < P1d; z++) {
    const CeedInt local = data.tidx + data.tidy*P1d + z*P1d*P1d;
    const CeedInt offset = local + elem * P1d*P1d*P1d;
    const CeedInt node = indices ? indices[offset] : offset;
    for (CeedInt c = 0; c < NCOMP; c++) {
      atomicAdd(&d_v[node * NCOMP + c], r_v[z+c*P1d]);
    }
  }
}
*241a4b83SYohann
// Stores a column of Q1d points, one per z in [0,Q1d), with NCOMP components each, from r_v to d_v.
// The point offset is data.tidx + data.tidy*Q1d + z*Q1d*Q1d + elem*Q1d*Q1d*Q1d.
// The value r_v[z + c*Q1d] is written to d_v[point + nquads*c]. Plain stores are used, not atomics.
template <int NCOMP, int Q1d>
inline __device__ void writeQuads3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  for (CeedInt z = 0; z < Q1d; z++) {
    const CeedInt local = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
    const CeedInt point = local + elem * Q1d*Q1d*Q1d;
    for (CeedInt c = 0; c < NCOMP; c++) {
      d_v[point + nquads * c] = r_v[z+c*Q1d];
    }
  }
}
*241a4b83SYohann
// Stores a column of Q1d points, one per z in [0,Q1d), with NCOMP components each, from r_v to d_v,
// with the components of a point stored next to each other in d_v.
// The point offset is data.tidx + data.tidy*Q1d + z*Q1d*Q1d + elem*Q1d*Q1d*Q1d.
// The value r_v[z + c*Q1d] is written to d_v[point*NCOMP + c]. nquads is not used by this layout.
template <int NCOMP, int Q1d>
inline __device__ void writeQuadsTranspose3d(BackendData& data, const CeedInt nquads, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  for (CeedInt z = 0; z < Q1d; z++) {
    const CeedInt local = data.tidx + data.tidy*Q1d + z*Q1d*Q1d;
    const CeedInt point = local + elem * Q1d*Q1d*Q1d;
    for (CeedInt c = 0; c < NCOMP; c++) {
      d_v[point * NCOMP + c] = r_v[z+c*Q1d];
    }
  }
}
*241a4b83SYohann
// 3D contraction along x, done one level at a time for k in [0,P1d):
//   V[k] = sum over i in [0,P1d) of B[i + data.tidx*P1d] * (value U[k] held by thread (i, data.tidy)).
// For each level, every thread publishes U[k] in data.slice[data.tidx + data.tidy*Q1d]. The
// barriers around the reads keep one level from overwriting data.slice while another is in use.
// Must be reached by all threads of the block.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  const CeedScalar *row = B + data.tidx*P1d;
  const CeedScalar *line = data.slice + data.tidy*Q1d;
  for (int level = 0; level < P1d; level++) {
    data.slice[data.tidx+data.tidy*Q1d] = U[level];
    __syncthreads();
    CeedScalar sum = 0.0;
    for (int p = 0; p < P1d; p++) {
      sum += row[p] * line[p]; // contract x direction
    }
    V[level] = sum;
    __syncthreads();
  }
}
*241a4b83SYohann
// 3D contraction along y, done one level at a time for k in [0,P1d):
//   V[k] = sum over i in [0,P1d) of B[i + data.tidy*P1d] * (value U[k] held by thread (data.tidx, i)).
// For each level, every thread publishes U[k] in data.slice[data.tidx + data.tidy*Q1d]. The
// barriers around the reads keep one level from overwriting data.slice while another is in use.
// Must be reached by all threads of the block.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  const CeedScalar *row = B + data.tidy*P1d;
  for (int level = 0; level < P1d; level++) {
    data.slice[data.tidx+data.tidy*Q1d] = U[level];
    __syncthreads();
    CeedScalar sum = 0.0;
    for (int p = 0; p < P1d; p++) {
      sum += row[p] * data.slice[data.tidx + p*Q1d]; // contract y direction
    }
    V[level] = sum;
    __syncthreads();
  }
}
*241a4b83SYohann
// 3D contraction along z, entirely within one thread:
//   V[k] = sum over i in [0,P1d) of B[i + k*P1d] * U[i], for k in [0,Q1d).
// Uses neither data.slice nor any barrier; data is accepted only for a uniform signature.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData& data,
                                   const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int q = 0; q < Q1d; q++) {
    const CeedScalar *row = B + q*P1d;
    V[q] = 0.0;
    for (int p = 0; p < P1d; p++) {
      V[q] += row[p] * U[p]; // contract z direction
    }
  }
}
*241a4b83SYohann
// Transposed 3D contraction along z, entirely within one thread:
//   V[k] = sum over i in [0,Q1d) of B[k + i*P1d] * U[i], for k in [0,P1d).
// Entries V[k] with P1d <= k < Q1d are set to 0.0.
// Uses neither data.slice nor any barrier; data is accepted only for a uniform signature.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData& data,
    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int k = 0; k < Q1d; k++) {
    V[k] = 0.0;
    if (k >= P1d) continue;
    for (int q = 0; q < Q1d; q++) {
      V[k] += B[k + q*P1d] * U[q]; // contract z direction
    }
  }
}
*241a4b83SYohann
// Transposed 3D contraction along y, done one level at a time for k in [0,P1d):
//   V[k] = sum over i in [0,Q1d) of B[data.tidy + i*P1d] * (value U[k] held by thread (data.tidx, i)).
// Threads with data.tidy >= P1d skip the sum and get V[k] = 0.0.
// The barriers stay outside the guard so that every thread of the block reaches them.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData& data,
    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  const bool active = data.tidy < P1d;
  for (int level = 0; level < P1d; level++) {
    data.slice[data.tidx+data.tidy*Q1d] = U[level];
    __syncthreads();
    CeedScalar sum = 0.0;
    if (active) {
      for (int q = 0; q < Q1d; q++) {
        sum += B[data.tidy + q*P1d] * data.slice[data.tidx + q*Q1d]; // contract y direction
      }
    }
    V[level] = sum;
    __syncthreads();
  }
}
*241a4b83SYohann
// Transposed 3D contraction along x, done one level at a time for k in [0,P1d):
//   V[k] = sum over i in [0,Q1d) of B[data.tidx + i*P1d] * (value U[k] held by thread (i, data.tidy)).
// Threads with data.tidx >= P1d skip the sum and get V[k] = 0.0.
// The barriers stay outside the guard so that every thread of the block reaches them.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData& data,
    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  const bool active = data.tidx < P1d;
  const CeedScalar *line = data.slice + data.tidy*Q1d;
  for (int level = 0; level < P1d; level++) {
    data.slice[data.tidx+data.tidy*Q1d] = U[level];
    __syncthreads();
    CeedScalar sum = 0.0;
    if (active) {
      for (int q = 0; q < Q1d; q++) {
        sum += B[data.tidx + q*P1d] * line[q]; // contract x direction
      }
    }
    V[level] = sum;
    __syncthreads();
  }
}
*241a4b83SYohann
// Same contraction as ContractTransposeX3d, but the terms are added onto the values already
// stored in V instead of starting from zero.
// Threads with data.tidx >= P1d leave V untouched.
// The terms are accumulated directly into V[k], one at a time, in increasing i.
// The barriers stay outside the guard so that every thread of the block reaches them.
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData& data,
    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  const bool active = data.tidx < P1d;
  const CeedScalar *line = data.slice + data.tidy*Q1d;
  for (int level = 0; level < P1d; level++) {
    data.slice[data.tidx+data.tidy*Q1d] = U[level];
    __syncthreads();
    if (active) {
      for (int q = 0; q < Q1d; q++) {
        V[level] += B[data.tidx + q*P1d] * line[q]; // contract x direction
      }
    }
    __syncthreads();
  }
}
*241a4b83SYohann
*241a4b83SYohanntemplate <int NCOMP, int P1d, int Q1d>
*241a4b83SYohanninline __device__ void interp3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
*241a4b83SYohann                                CeedScalar *__restrict__ r_V) {
*241a4b83SYohann  CeedScalar r_t1[Q1d];
*241a4b83SYohann  CeedScalar r_t2[Q1d];
*241a4b83SYohann  for(int comp=0; comp<NCOMP; comp++) {
*241a4b83SYohann    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
*241a4b83SYohann    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
*241a4b83SYohann    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d);
*241a4b83SYohann  }
*241a4b83SYohann}
*241a4b83SYohann
*241a4b83SYohanntemplate <int NCOMP, int P1d, int Q1d>
*241a4b83SYohanninline __device__ void interpTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
*241a4b83SYohann                                CeedScalar *__restrict__ r_V) {
*241a4b83SYohann  CeedScalar r_t1[Q1d];
*241a4b83SYohann  CeedScalar r_t2[Q1d];
*241a4b83SYohann  for(int comp=0; comp<NCOMP; comp++) {
*241a4b83SYohann    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d, c_B, r_t1);
*241a4b83SYohann    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
*241a4b83SYohann    ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
*241a4b83SYohann  }
*241a4b83SYohann}
*241a4b83SYohann
*241a4b83SYohanntemplate <int NCOMP, int P1d, int Q1d>
*241a4b83SYohanninline __device__ void grad3d(BackendData& data, const CeedScalar *__restrict__ r_U,
*241a4b83SYohann                              const CeedScalar *c_B, const CeedScalar *c_G,
*241a4b83SYohann                              CeedScalar *__restrict__ r_V) {
*241a4b83SYohann  CeedScalar r_t1[Q1d];
*241a4b83SYohann  CeedScalar r_t2[Q1d];
*241a4b83SYohann  for(int comp=0; comp<NCOMP; comp++) {
*241a4b83SYohann    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_G, r_t1);
*241a4b83SYohann    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
*241a4b83SYohann    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+0*NCOMP*Q1d);
*241a4b83SYohann    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
*241a4b83SYohann    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
*241a4b83SYohann    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*Q1d+1*NCOMP*Q1d);
*241a4b83SYohann    ContractX3d<NCOMP,P1d,Q1d>(data, r_U+comp*P1d, c_B, r_t1);
*241a4b83SYohann    ContractY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
*241a4b83SYohann    ContractZ3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*Q1d+2*NCOMP*Q1d);
*241a4b83SYohann  }
*241a4b83SYohann}
*241a4b83SYohann
*241a4b83SYohanntemplate <int NCOMP, int P1d, int Q1d>
*241a4b83SYohanninline __device__ void gradTranspose3d(BackendData& data, const CeedScalar *__restrict__ r_U,
*241a4b83SYohann                              const CeedScalar *c_B, const CeedScalar *c_G,
*241a4b83SYohann                              CeedScalar *__restrict__ r_V) {
*241a4b83SYohann  CeedScalar r_t1[Q1d];
*241a4b83SYohann  CeedScalar r_t2[Q1d];
*241a4b83SYohann  for(int comp=0; comp<NCOMP; comp++) {
*241a4b83SYohann    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+0*NCOMP*Q1d, c_B, r_t1);
*241a4b83SYohann    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
*241a4b83SYohann    ContractTransposeX3d<NCOMP,P1d,Q1d>(data, r_t2, c_G, r_V+comp*P1d);
*241a4b83SYohann    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+1*NCOMP*Q1d, c_B, r_t1);
*241a4b83SYohann    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_G, r_t2);
*241a4b83SYohann    ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
*241a4b83SYohann    ContractTransposeZ3d<NCOMP,P1d,Q1d>(data, r_U+comp*Q1d+2*NCOMP*Q1d, c_G, r_t1);
*241a4b83SYohann    ContractTransposeY3d<NCOMP,P1d,Q1d>(data, r_t1, c_B, r_t2);
*241a4b83SYohann    ContractTransposeAddX3d<NCOMP,P1d,Q1d>(data, r_t2, c_B, r_V+comp*P1d);
*241a4b83SYohann  }
*241a4b83SYohann}
*241a4b83SYohann
*241a4b83SYohanntemplate <int Q1d>
*241a4b83SYohanninline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
*241a4b83SYohann  *w = qweight1d[data.tidx];
*241a4b83SYohann}
*241a4b83SYohann
*241a4b83SYohanntemplate <int Q1d>
*241a4b83SYohanninline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
*241a4b83SYohann  *w = qweight1d[data.tidx]*qweight1d[data.tidy];
*241a4b83SYohann}
*241a4b83SYohann
*241a4b83SYohanntemplate <int Q1d>
*241a4b83SYohanninline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d, CeedScalar *w) {
*241a4b83SYohann  const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy];
*241a4b83SYohann  for (int z = 0; z < Q1d; ++z)
*241a4b83SYohann  {
*241a4b83SYohann    w[z] = pw*qweight1d[z];
*241a4b83SYohann  }
*241a4b83SYohann}
*241a4b83SYohann
*241a4b83SYohann);
*241a4b83SYohann
*241a4b83SYohannextern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {
*241a4b83SYohann
*241a4b83SYohann	using std::ostringstream;
*241a4b83SYohann  using std::string;
*241a4b83SYohann  int ierr;
*241a4b83SYohann  bool setupdone;
*241a4b83SYohann  ierr = CeedOperatorGetSetupStatus(op, &setupdone); CeedChk(ierr);
*241a4b83SYohann  if (setupdone) return 0;
*241a4b83SYohann  Ceed ceed;
*241a4b83SYohann  ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
*241a4b83SYohann  CeedOperator_Cuda_gen *data;
*241a4b83SYohann  ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr);
*241a4b83SYohann  CeedQFunction qf;
*241a4b83SYohann  CeedQFunction_Cuda_gen *qf_data;
*241a4b83SYohann  ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
*241a4b83SYohann  ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr);
*241a4b83SYohann  CeedInt Q, P1d, Q1d = -1, numelements, elemsize, numinputfields, numoutputfields, ncomp, dim, ndof;
*241a4b83SYohann  ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
*241a4b83SYohann  ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
*241a4b83SYohann  ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
*241a4b83SYohann  CeedChk(ierr);
*241a4b83SYohann  CeedOperatorField *opinputfields, *opoutputfields;
*241a4b83SYohann  ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
*241a4b83SYohann  CeedChk(ierr);
*241a4b83SYohann  CeedQFunctionField *qfinputfields, *qfoutputfields;
*241a4b83SYohann  ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields);
*241a4b83SYohann  CeedChk(ierr);
*241a4b83SYohann  CeedEvalMode emode;
*241a4b83SYohann  CeedTransposeMode lmode;
*241a4b83SYohann  CeedBasis basis;
*241a4b83SYohann  CeedBasis_Cuda_shared *basis_data;
*241a4b83SYohann  CeedElemRestriction Erestrict;
*241a4b83SYohann  CeedElemRestriction_Cuda_reg *restr_data;
*241a4b83SYohann
*241a4b83SYohann  ostringstream code;
*241a4b83SYohann  string devFunctions(deviceFunctions);
*241a4b83SYohann
*241a4b83SYohann  code << devFunctions;
*241a4b83SYohann
*241a4b83SYohann  string qFunction(qf_data->qFunctionSource);
*241a4b83SYohann  code << qFunction;
*241a4b83SYohann
*241a4b83SYohann  // Setup
*241a4b83SYohann  code << "\nextern \"C\" __global__ void oper(CeedInt nelem, void* ctx, CudaFieldsInt indices, CudaFields fields, CudaFields B, CudaFields G, CeedScalar* W) {\n";
*241a4b83SYohann  // Input Evecs and Restriction
*241a4b83SYohann  for (CeedInt i = 0; i < numinputfields; i++) {
*241a4b83SYohann    ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
*241a4b83SYohann    CeedChk(ierr);
*241a4b83SYohann    if (emode == CEED_EVAL_WEIGHT) { // Skip
*241a4b83SYohann    } else {
*241a4b83SYohann      code << "const CeedScalar* d_u" <<i<<" = fields.in["<<i<<"];\n";
*241a4b83SYohann      if (emode != CEED_EVAL_NONE)
*241a4b83SYohann      {
*241a4b83SYohann        ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
*241a4b83SYohann        bool isTensor;
*241a4b83SYohann        ierr = CeedBasisGetTensorStatus(basis, &isTensor); CeedChk(ierr);
*241a4b83SYohann        //TODO check that all are the same
*241a4b83SYohann        ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
*241a4b83SYohann        if (isTensor)
*241a4b83SYohann        {
*241a4b83SYohann          //TODO check that all are the same
*241a4b83SYohann          ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
*241a4b83SYohann        } else {
*241a4b83SYohann          return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis");
*241a4b83SYohann        }
*241a4b83SYohann      }
*241a4b83SYohann    }
*241a4b83SYohann  }
*241a4b83SYohann  data->dim = dim;
*241a4b83SYohann  data->Q1d = Q1d;
*241a4b83SYohann
*241a4b83SYohann  for (CeedInt i = 0; i < numoutputfields; i++) {
*241a4b83SYohann    code << "CeedScalar* d_v"<<i<<" = fields.out["<<i<<"];\n";
*241a4b83SYohann  }
*241a4b83SYohann  code << "const CeedInt Dim = "<<dim<<";\n";
*241a4b83SYohann  code << "const CeedInt Q1d = "<<Q1d<<";\n";
*241a4b83SYohann  // code << "const CeedInt Q   = "<<Q<<";\n";
*241a4b83SYohann  code << "extern __shared__ CeedScalar slice[];\n";
*241a4b83SYohann  code << "BackendData data;\n";
*241a4b83SYohann  code << "data.tidx = threadIdx.x;\n";
*241a4b83SYohann  code << "data.tidy = threadIdx.y;\n";
*241a4b83SYohann  code << "data.tidz = threadIdx.z;\n";
*241a4b83SYohann  code << "data.tid  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
*241a4b83SYohann  code << "data.slice = slice+data.tidz*Q1d"<<(dim>1?"*Q1d":"")<<";\n";
*241a4b83SYohann  for (CeedInt i = 0; i < numinputfields; i++) {
*241a4b83SYohann    code << "// Input field "<<i<<"\n";
*241a4b83SYohann    // Get elemsize, emode, ncomp
*241a4b83SYohann    ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
*241a4b83SYohann    CeedChk(ierr);
*241a4b83SYohann    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
*241a4b83SYohann    CeedChk(ierr);
*241a4b83SYohann    ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
*241a4b83SYohann    CeedChk(ierr);
*241a4b83SYohann    ierr = CeedQFunctionFieldGetNumComponents(qfinputfields[i], &ncomp);
*241a4b83SYohann    CeedChk(ierr);
*241a4b83SYohann    // Basis action
*241a4b83SYohann    switch (emode) {
*241a4b83SYohann    case CEED_EVAL_NONE:
*241a4b83SYohann      ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr);
*241a4b83SYohann      code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
*241a4b83SYohann      code << "  const CeedInt nquads_in_"<<i<<" = "<<ndof/ncomp<<";\n";
*241a4b83SYohann      break;
*241a4b83SYohann    case CEED_EVAL_INTERP:
*241a4b83SYohann      ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
*241a4b83SYohann      ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
*241a4b83SYohann      ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr);
*241a4b83SYohann      code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
*241a4b83SYohann      code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
*241a4b83SYohann      code << "  const CeedInt ndofs_in_"<<i<<" = "<<ndof<<";\n";
*241a4b83SYohann      ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
*241a4b83SYohann      data->B.in[i] = basis_data->d_interp1d;
*241a4b83SYohann      code << "  __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
*241a4b83SYohann      code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
*241a4b83SYohann      break;
*241a4b83SYohann    case CEED_EVAL_GRAD:
*241a4b83SYohann      ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
*241a4b83SYohann      ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr);
*241a4b83SYohann      code << "  const CeedInt ncomp_in_"<<i<<" = "<<ncomp<<";\n";
*241a4b83SYohann      code << "  const CeedInt ndofs_in_"<<i<<" = "<<ndof<<";\n";
*241a4b83SYohann      ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
*241a4b83SYohann      code << "  const CeedInt P_in_"<<i<<" = "<<P1d<<";\n";
*241a4b83SYohann      ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
*241a4b83SYohann      data->B.in[i] = basis_data->d_interp1d;
*241a4b83SYohann      data->G.in[i] = basis_data->d_grad1d;
*241a4b83SYohann      code << "  __shared__ double s_B_in_"<<i<<"["<<P1d*Q1d<<"];\n";
*241a4b83SYohann      code << "  __shared__ double s_G_in_"<<i<<"["<<P1d*Q1d<<"];\n";
*241a4b83SYohann      code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, B.in["<<i<<"], s_B_in_"<<i<<");\n";
*241a4b83SYohann      code << "  loadMatrix<P_in_"<<i<<",Q1d>(data, G.in["<<i<<"], s_G_in_"<<i<<");\n";
*241a4b83SYohann      break;
*241a4b83SYohann    case CEED_EVAL_WEIGHT:
*241a4b83SYohann      break; // No action
*241a4b83SYohann    case CEED_EVAL_DIV:
*241a4b83SYohann      break; // TODO: Not implemented
*241a4b83SYohann    case CEED_EVAL_CURL:
*241a4b83SYohann      break; // TODO: Not implemented
*241a4b83SYohann    }
*241a4b83SYohann  }
*241a4b83SYohann  for (CeedInt i = 0; i < numoutputfields; i++) {
*241a4b83SYohann    code << "// Output field "<<i<<"\n";
*241a4b83SYohann    // Get elemsize, emode, ncomp
*241a4b83SYohann    ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
*241a4b83SYohann    CeedChk(ierr);
*241a4b83SYohann    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
*241a4b83SYohann    CeedChk(ierr);
*241a4b83SYohann    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
*241a4b83SYohann    CeedChk(ierr);
*241a4b83SYohann    ierr = CeedQFunctionFieldGetNumComponents(qfoutputfields[i], &ncomp);
*241a4b83SYohann    CeedChk(ierr);
*241a4b83SYohann    // Basis action
*241a4b83SYohann    switch (emode) {
*241a4b83SYohann    case CEED_EVAL_NONE:
*241a4b83SYohann      code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
*241a4b83SYohann      ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr);
*241a4b83SYohann      ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr);
*241a4b83SYohann      code << "  const CeedInt nquads_out_"<<i<<" = "<<ndof<<"/ncomp_out_"<<i<<";\n";
*241a4b83SYohann      break; // No action
*241a4b83SYohann    case CEED_EVAL_INTERP:
*241a4b83SYohann      code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
*241a4b83SYohann      ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
*241a4b83SYohann      ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
*241a4b83SYohann      code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
*241a4b83SYohann      ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
*241a4b83SYohann      data->B.out[i] = basis_data->d_interp1d;
*241a4b83SYohann      code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
*241a4b83SYohann      code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
*241a4b83SYohann      ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr);
*241a4b83SYohann      code << "  const CeedInt ndofs_out_"<<i<<" = "<<ndof<<";\n";
*241a4b83SYohann      break;
*241a4b83SYohann    case CEED_EVAL_GRAD:
*241a4b83SYohann      code << "  const CeedInt ncomp_out_"<<i<<" = "<<ncomp<<";\n";
*241a4b83SYohann      ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr);
*241a4b83SYohann      ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
*241a4b83SYohann      code << "  const CeedInt P_out_"<<i<<" = "<<P1d<<";\n";
*241a4b83SYohann      ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
*241a4b83SYohann      data->B.out[i] = basis_data->d_interp1d;
*241a4b83SYohann      data->G.out[i] = basis_data->d_grad1d;
*241a4b83SYohann      code << "  __shared__ double s_B_out_"<<i<<"["<<P1d*Q1d<<"];\n";
*241a4b83SYohann      code << "  __shared__ double s_G_out_"<<i<<"["<<P1d*Q1d<<"];\n";
*241a4b83SYohann      code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, B.out["<<i<<"], s_B_out_"<<i<<");\n";
*241a4b83SYohann      code << "  loadMatrix<P_out_"<<i<<",Q1d>(data, G.out["<<i<<"], s_G_out_"<<i<<");\n";
*241a4b83SYohann      ierr = CeedElemRestrictionGetNumDoF(Erestrict, &ndof); CeedChk(ierr);
*241a4b83SYohann      code << "  const CeedInt ndofs_out_"<<i<<" = "<<ndof<<";\n";
*241a4b83SYohann      break;
*241a4b83SYohann    case CEED_EVAL_WEIGHT: {
*241a4b83SYohann      Ceed ceed;
*241a4b83SYohann      ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
*241a4b83SYohann      return CeedError(ceed, 1,
*241a4b83SYohann                       "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
*241a4b83SYohann      break; // Should not occur
*241a4b83SYohann    }
*241a4b83SYohann    case CEED_EVAL_DIV:
*241a4b83SYohann      break; // TODO: Not implemented
*241a4b83SYohann    case CEED_EVAL_CURL:
*241a4b83SYohann      break; // TODO: Not implemented
*241a4b83SYohann    }
*241a4b83SYohann  }
*241a4b83SYohann  code << "for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; elem += gridDim.x*blockDim.z) {\n";
*241a4b83SYohann  // Input basis apply if needed
*241a4b83SYohann  for (CeedInt i = 0; i < numinputfields; i++) {
*241a4b83SYohann    code << "// Input field "<<i<<"\n";
*241a4b83SYohann    // Get elemsize, emode, ncomp
*241a4b83SYohann    ierr = CeedOperatorFieldGetElemRestriction(opinputfields[i], &Erestrict);
*241a4b83SYohann    CeedChk(ierr);
*241a4b83SYohann    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
*241a4b83SYohann    CeedChk(ierr);
*241a4b83SYohann    ierr = CeedQFunctionFieldGetEvalMode(qfinputfields[i], &emode);
*241a4b83SYohann    CeedChk(ierr);
*241a4b83SYohann    ierr = CeedQFunctionFieldGetNumComponents(qfinputfields[i], &ncomp);
*241a4b83SYohann    CeedChk(ierr);
*241a4b83SYohann    // Basis action
*241a4b83SYohann    switch (emode) {
*241a4b83SYohann    case CEED_EVAL_NONE:
*241a4b83SYohann      code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
*241a4b83SYohann      ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr);
*241a4b83SYohann      code << "  readQuads"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",Q1d>(data, nquads_in_"<<i<<", elem, d_u"<<i<<", r_t"<<i<<");\n";
*241a4b83SYohann      break;
*241a4b83SYohann    case CEED_EVAL_INTERP:
*241a4b83SYohann      code << "  CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
*241a4b83SYohann      ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr);
*241a4b83SYohann      ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
*241a4b83SYohann      data->indices.in[i] = restr_data->d_ind;
*241a4b83SYohann      code << "  readDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, ndofs_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
*241a4b83SYohann      code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Q1d];\n";
*241a4b83SYohann      code << "  interp"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", r_t"<<i<<");\n";
*241a4b83SYohann      break;
*241a4b83SYohann    case CEED_EVAL_GRAD:
*241a4b83SYohann      code << "  CeedScalar r_u"<<i<<"[ncomp_in_"<<i<<"*P_in_"<<i<<"];\n";
*241a4b83SYohann      ierr = CeedOperatorFieldGetLMode(opinputfields[i], &lmode); CeedChk(ierr);
*241a4b83SYohann      ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
*241a4b83SYohann      data->indices.in[i] = restr_data->d_ind;
*241a4b83SYohann      code << "  readDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<">(data, ndofs_in_"<<i<<", elem, indices.in["<<i<<"], d_u"<<i<<", r_u"<<i<<");\n";
*241a4b83SYohann      code << "  CeedScalar r_t"<<i<<"[ncomp_in_"<<i<<"*Dim*Q1d];\n";
*241a4b83SYohann      code << "  grad"<<dim<<"d<ncomp_in_"<<i<<",P_in_"<<i<<",Q1d>(data, r_u"<<i<<", s_B_in_"<<i<<", s_G_in_"<<i<<", r_t"<<i<<");\n";
*241a4b83SYohann      break;
*241a4b83SYohann    case CEED_EVAL_WEIGHT:
*241a4b83SYohann      code << "  CeedScalar r_t"<<i<<"[Q1d];\n";
*241a4b83SYohann      ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr);
*241a4b83SYohann      ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr);
*241a4b83SYohann      data->W = basis_data->d_qweight1d;
*241a4b83SYohann      code << "  weight"<<dim<<"d<Q1d>(data, W, r_t"<<i<<");\n";
*241a4b83SYohann      break; // No action
*241a4b83SYohann    case CEED_EVAL_DIV:
*241a4b83SYohann      break; // TODO: Not implemented
*241a4b83SYohann    case CEED_EVAL_CURL:
*241a4b83SYohann      break; // TODO: Not implemented
*241a4b83SYohann    }
*241a4b83SYohann  }
*241a4b83SYohann  // Q function
*241a4b83SYohann  code << "// QFunction\n";
*241a4b83SYohann  for (CeedInt i = 0; i < numoutputfields; i++) {
*241a4b83SYohann    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
*241a4b83SYohann    CeedChk(ierr);
*241a4b83SYohann    if (emode==CEED_EVAL_GRAD)
*241a4b83SYohann    {
*241a4b83SYohann      code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Dim*Q1d];\n";
*241a4b83SYohann    }
*241a4b83SYohann    if (emode==CEED_EVAL_NONE || emode==CEED_EVAL_INTERP)
*241a4b83SYohann    {
*241a4b83SYohann      code << "  CeedScalar r_tt"<<i<<"[ncomp_out_"<<i<<"*Q1d];\n";
*241a4b83SYohann    }
*241a4b83SYohann  }
*241a4b83SYohann  //TODO write qfunction load for this backend
*241a4b83SYohann  string qFunctionName(qf_data->qFunctionName);
*241a4b83SYohann  code << "  "<<qFunctionName<<"(ctx, "<<(dim==3?"Q1d":"1")<<", ";
*241a4b83SYohann  for (CeedInt i = 0; i < numinputfields; i++) {
*241a4b83SYohann    code << "r_t"<<i<<", ";
*241a4b83SYohann  }
*241a4b83SYohann  for (CeedInt i = 0; i < numoutputfields; i++) {
*241a4b83SYohann    code << "r_tt"<<i;
*241a4b83SYohann    if (i<numoutputfields-1)
*241a4b83SYohann    {
*241a4b83SYohann      code << ", ";
*241a4b83SYohann    }
*241a4b83SYohann  }
*241a4b83SYohann  code << ");\n";
*241a4b83SYohann
*241a4b83SYohann  // Output basis apply if needed
*241a4b83SYohann  for (CeedInt i = 0; i < numoutputfields; i++) {
*241a4b83SYohann    code << "// Output field "<<i<<"\n";
*241a4b83SYohann    // Get elemsize, emode, ncomp
*241a4b83SYohann    ierr = CeedOperatorFieldGetElemRestriction(opoutputfields[i], &Erestrict);
*241a4b83SYohann    CeedChk(ierr);
*241a4b83SYohann    ierr = CeedElemRestrictionGetElementSize(Erestrict, &elemsize);
*241a4b83SYohann    CeedChk(ierr);
*241a4b83SYohann    ierr = CeedQFunctionFieldGetEvalMode(qfoutputfields[i], &emode);
*241a4b83SYohann    CeedChk(ierr);
*241a4b83SYohann    ierr = CeedQFunctionFieldGetNumComponents(qfoutputfields[i], &ncomp);
*241a4b83SYohann    CeedChk(ierr);
*241a4b83SYohann    // Basis action
*241a4b83SYohann    switch (emode) {
*241a4b83SYohann    case CEED_EVAL_NONE:
*241a4b83SYohann      ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr);
*241a4b83SYohann      code << "  writeQuads"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",Q1d>(data, nquads_out_"<<i<<", elem, r_tt"<<i<<", d_v"<<i<<");\n";
*241a4b83SYohann      break; // No action
*241a4b83SYohann    case CEED_EVAL_INTERP:
*241a4b83SYohann      code << "  CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
*241a4b83SYohann      code << "  interpTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", r_v"<<i<<");\n";
*241a4b83SYohann      ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr);
*241a4b83SYohann      ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
*241a4b83SYohann      data->indices.out[i] = restr_data->d_ind;
*241a4b83SYohann      code << "  writeDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, ndofs_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
*241a4b83SYohann      break;
*241a4b83SYohann    case CEED_EVAL_GRAD:
*241a4b83SYohann      code << "  CeedScalar r_v"<<i<<"[ncomp_out_"<<i<<"*P_out_"<<i<<"];\n";
*241a4b83SYohann      code << "  gradTranspose"<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<",Q1d>(data, r_tt"<<i<<", s_B_out_"<<i<<", s_G_out_"<<i<<", r_v"<<i<<");\n";
*241a4b83SYohann      ierr = CeedOperatorFieldGetLMode(opoutputfields[i], &lmode); CeedChk(ierr);
*241a4b83SYohann      ierr = CeedElemRestrictionGetData(Erestrict, (void **)&restr_data); CeedChk(ierr);
*241a4b83SYohann      data->indices.out[i] = restr_data->d_ind;
*241a4b83SYohann      code << "  writeDofs"<<(lmode==CEED_NOTRANSPOSE?"":"Transpose")<<dim<<"d<ncomp_out_"<<i<<",P_out_"<<i<<">(data, ndofs_out_"<<i<<", elem, indices.out["<<i<<"], r_v"<<i<<", d_v"<<i<<");\n";
*241a4b83SYohann      break;
*241a4b83SYohann    case CEED_EVAL_WEIGHT: {
*241a4b83SYohann      Ceed ceed;
*241a4b83SYohann      ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
*241a4b83SYohann      return CeedError(ceed, 1,
*241a4b83SYohann                       "CEED_EVAL_WEIGHT cannot be an output evaluation mode");
*241a4b83SYohann      break; // Should not occur
*241a4b83SYohann    }
*241a4b83SYohann    case CEED_EVAL_DIV:
*241a4b83SYohann      break; // TODO: Not implemented
*241a4b83SYohann    case CEED_EVAL_CURL:
*241a4b83SYohann      break; // TODO: Not implemented
*241a4b83SYohann    }
*241a4b83SYohann  }
*241a4b83SYohann
*241a4b83SYohann  code << "  }\n";
*241a4b83SYohann  code << "}\n\n";
*241a4b83SYohann
*241a4b83SYohann  // std::cout << code.str();
*241a4b83SYohann
*241a4b83SYohann  ierr = CeedCompileCuda(ceed, code.str().c_str(), &data->module, 0); CeedChk(ierr);
*241a4b83SYohann  ierr = CeedGetKernelCuda(ceed, data->module, "oper", &data->op);
*241a4b83SYohann  CeedChk(ierr);
*241a4b83SYohann
*241a4b83SYohann  ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr);
*241a4b83SYohann
*241a4b83SYohann  return 0;
*241a4b83SYohann}