backends/cuda-shared/ceed-cuda-shared-basis.c

// Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
// Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
// All Rights reserved. See files LICENSE and NOTICE for details.
//
// This file is part of CEED, a collection of benchmarks, miniapps, software
// libraries and APIs for efficient high-order finite element and spectral
// element discretizations for exascale applications. For more information and
// source code availability see http://github.com/ceed.
//
// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
// a collaborative effort of two U.S. Department of Energy organizations (Office
// of Science and the National Nuclear Security Administration) responsible for
// the planning and preparation of a capable exascale ecosystem, including
// software, applications, hardware, advanced system engineering and early
// testbed platforms, in support of the nation's exascale computing imperative.
c532df63SYohann
c532df63SYohann#include <ceed-backend.h>
c532df63SYohann#include <ceed.h>
c532df63SYohann#include "ceed-cuda-shared.h"
c532df63SYohann#include "../cuda/ceed-cuda.h"
c532df63SYohann
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// Shared mem kernels
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
cb0b5415Sjeremylt// *INDENT-OFF*
c532df63SYohannstatic const char *kernelsShared = QUOTE(
c532df63SYohann
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// Sum input into output
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
// Accumulate r_U into r_V entry by entry over the index range [0, Q1D).
inline __device__ void add(CeedScalar *r_V, const CeedScalar *r_U) {
  int q = 0;
  while (q < Q1D) {
    r_V[q] += r_U[q];
    ++q;
  }
}
c532df63SYohann
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// 1D
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
c532df63SYohann
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// Read DoFs
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
// Copy the P1D nodal values of component comp of element elem from d_U into
// the row of slice selected by tidz, then zero the entries P1D..Q1D-1 of that
// row. The parameters tidx, tidy and nelem are accepted but not used here.
inline __device__ void readDofs1d(const int elem, const int tidx,
                                  const int tidy, const int tidz,
                                  const int comp, const int nelem,
                                  const CeedScalar *d_U, CeedScalar *slice) {
  const int rowStart = tidz*Q1D;
  const int srcStart = comp*P1D + elem*BASIS_NCOMP*P1D;
  // Nodal values
  for (int n = 0; n < P1D; n++)
    slice[n + rowStart] = d_U[n + srcStart];
  // Zero padding up to Q1D entries
  for (int n = P1D; n < Q1D; n++)
    slice[n + rowStart] = 0.0;
}
c532df63SYohann
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// Write DoFs
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
// Store r_V as the nodal value tidx of component comp of element elem in d_V.
// Threads with tidx >= P1D store nothing. tidy and nelem are not used here.
inline __device__ void writeDofs1d(const int elem, const int tidx,
                                   const int tidy, const int comp,
                                   const int nelem, const CeedScalar &r_V,
                                   CeedScalar *d_V) {
  if (tidx >= P1D)
    return;
  const int node = tidx + comp*P1D + elem*BASIS_NCOMP*P1D;
  d_V[node] = r_V;
}
c532df63SYohann
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// Read quadrature point data
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
// Copy the Q1D quadrature values of (elem, comp, dim) from d_U into the row
// of slice selected by tidz. The source index is
//   q + elem*Q1D + comp*Q1D*nelem + dim*BASIS_NCOMP*nelem*Q1D.
// tidx and tidy are accepted but not used here.
inline __device__ void readQuads1d(const int elem, const int tidx,
                                   const int tidy, const int tidz,
                                   const int comp, const int dim,
                                   const int nelem, const CeedScalar *d_U,
                                   CeedScalar *slice) {
  const int rowStart = tidz*Q1D;
  const int srcStart = elem*Q1D + comp*Q1D*nelem + dim*BASIS_NCOMP*nelem*Q1D;
  for (int q = 0; q < Q1D; q++)
    slice[q + rowStart] = d_U[q + srcStart];
}
c532df63SYohann
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// Write quadrature point data
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
// Store r_V as the quadrature value tidx of (elem, comp, dim) in d_V, using
// the index tidx + elem*Q1D + comp*Q1D*nelem + dim*BASIS_NCOMP*nelem*Q1D.
// tidy is accepted but not used here.
inline __device__ void writeQuads1d(const int elem, const int tidx,
                                    const int tidy, const int comp,
                                    const int dim, const int nelem,
                                    const CeedScalar &r_V, CeedScalar *d_V) {
  const int elemOffset = elem*Q1D;
  const int compOffset = comp*Q1D*nelem;
  const int dimOffset = dim*BASIS_NCOMP*nelem*Q1D;
  d_V[tidx + elemOffset + compOffset + dimOffset] = r_V;
}
c532df63SYohann
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// 1D tensor contraction
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
// Contract the slice row selected by tidz with row tidx of B:
//   V = sum over i in [0, P1D) of B[i + tidx*P1D] * slice[i + tidz*Q1D].
// The arguments U and tidy are accepted but not read.
inline __device__ void ContractX1d(CeedScalar *slice, const int tidx,
                                   const int tidy, const int tidz,
                                   const CeedScalar &U, const CeedScalar *B,
                                   CeedScalar &V) {
  const int rowStart = tidz*Q1D;
  const int matStart = tidx*P1D;
  CeedScalar sum = 0.0;
  for (int p = 0; p < P1D; ++p)
    sum += B[p + matStart] * slice[p + rowStart]; // Contract x direction
  V = sum;
}
c532df63SYohann
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// 1D transpose tensor contraction
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
// Contract the slice row selected by tidz with column tidx of B (transposed
// application):
//   V = sum over i in [0, Q1D) of B[tidx + i*P1D] * slice[i + tidz*Q1D].
// Only threads with tidx < P1D compute a value; every other thread gets
// V = 0. This matches the guard used by ContractTransposeX2d. Without it a
// thread with tidx >= P1D would index B beyond entry P1D*Q1D - 1 on the last
// row (assuming B holds P1D*Q1D entries). The value produced by such threads
// is never stored, since writeDofs1d ignores tidx >= P1D.
// The arguments U and tidy are accepted but not read.
inline __device__ void ContractTransposeX1d(CeedScalar *slice, const int tidx,
    const int tidy, const int tidz,
    const CeedScalar &U, const CeedScalar *B, CeedScalar &V) {
  V = 0.0;
  if (tidx < P1D)
    for (int i = 0; i < Q1D; ++i)
      V += B[tidx + i*P1D] * slice[i + tidz*Q1D]; // Contract x direction
}
c532df63SYohann
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// 1D interpolate to quadrature points
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
// 1D interpolation for every component of every element.
//   transpose == 0: nodal values in d_U are contracted with c_B and written
//                   to d_V at the quadrature points (dim index 0).
//   transpose != 0: quadrature values in d_U are contracted with the
//                   transpose of c_B and written to the nodes in d_V.
// Elements are distributed over blockIdx.x and threadIdx.z; slice is the
// scratch buffer used by the read and contraction helpers.
// NOTE(review): no barrier separates filling slice from reading it. Each
// thread fills the complete row for its tidz itself - confirm this is the
// intended synchronization model.
inline __device__ void interp1d(const CeedInt nelem, const int transpose,
                                const CeedScalar *c_B,
                                const CeedScalar *__restrict__ d_U,
                                CeedScalar *__restrict__ d_V,
                                CeedScalar *slice) {
  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int tidz = threadIdx.z;
  const unsigned int elemStride = gridDim.x*blockDim.z;

  CeedScalar r_out;
  CeedScalar r_in; // Passed as the U argument, which the 1D contractions do not read

  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem;
       elem += elemStride) {
    int comp = 0;
    while (comp < BASIS_NCOMP) {
      if (transpose) {
        // Quadrature points -> nodes
        readQuads1d(elem, tidx, tidy, tidz, comp, 0, nelem, d_U, slice);
        ContractTransposeX1d(slice, tidx, tidy, tidz, r_in, c_B, r_out);
        writeDofs1d(elem, tidx, tidy, comp, nelem, r_out, d_V);
      } else {
        // Nodes -> quadrature points
        readDofs1d(elem, tidx, tidy, tidz, comp, nelem, d_U, slice);
        ContractX1d(slice, tidx, tidy, tidz, r_in, c_B, r_out);
        writeQuads1d(elem, tidx, tidy, comp, 0, nelem, r_out, d_V);
      }
      comp++;
    }
  }
}
c532df63SYohann
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// 1D derivatives at quadrature points
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
// 1D derivative for every component of every element.
//   transpose == 0: nodal values in d_U are contracted with c_G and written
//                   to d_V at the quadrature points (dim index 0).
//   transpose != 0: quadrature values in d_U (dim index 0) are contracted
//                   with the transpose of c_G and written to the nodes in d_V.
// c_B is accepted but not used in 1D. Elements are distributed over
// blockIdx.x and threadIdx.z.
inline __device__ void grad1d(const CeedInt nelem, const int transpose,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              const CeedScalar *__restrict__ d_U,
                              CeedScalar *__restrict__ d_V,
                              CeedScalar *slice) {
  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int tidz = threadIdx.z;
  const unsigned int elemStride = gridDim.x*blockDim.z;
  const int onlyDim = 0; // 1D has a single derivative direction

  CeedScalar r_in; // Passed as the U argument, which the 1D contractions do not read
  CeedScalar r_out;

  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem;
       elem += elemStride) {
    for (int comp = 0; comp < BASIS_NCOMP; comp++) {
      if (transpose) {
        // Quadrature points -> nodes
        readQuads1d(elem, tidx, tidy, tidz, comp, onlyDim, nelem, d_U, slice);
        ContractTransposeX1d(slice, tidx, tidy, tidz, r_in, c_G, r_out);
        writeDofs1d(elem, tidx, tidy, comp, nelem, r_out, d_V);
      } else {
        // Nodes -> quadrature points
        readDofs1d(elem, tidx, tidy, tidz, comp, nelem, d_U, slice);
        ContractX1d(slice, tidx, tidy, tidz, r_in, c_G, r_out);
        writeQuads1d(elem, tidx, tidy, comp, onlyDim, nelem, r_out, d_V);
      }
    }
  }
}
c532df63SYohann
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// 1D Quadrature weights
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
// Fill w with the 1D quadrature weights: for each element handled by this
// thread, w[elem*Q1D + threadIdx.x] = qweight1d[threadIdx.x].
// Elements are distributed over blockIdx.x and threadIdx.y.
__device__ void weight1d(const CeedInt nelem, const CeedScalar *qweight1d,
                         CeedScalar *w) {
  const int q = threadIdx.x;
  const CeedScalar wq = qweight1d[q];
  const unsigned int elemStride = gridDim.x*blockDim.y;
  CeedInt elem = blockIdx.x*blockDim.y + threadIdx.y;
  while (elem < nelem) {
    w[elem*Q1D + q] = wq;
    elem += elemStride;
  }
}
*ab213215SJeremy L Thompson
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// 2D
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// Read DoFs
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
// Load into U the nodal value (tidx, tidy) of component comp of element elem
// from d_U. Threads outside the P1D x P1D node range get U = 0 and do not
// touch d_U. nelem is accepted but not used here.
inline __device__ void readDofs2d(const int elem, const int tidx,
                                  const int tidy, const int comp,
                                  const int nelem, const CeedScalar *d_U,
                                  CeedScalar &U) {
  if (tidx < P1D && tidy < P1D) {
    U = d_U[tidx + tidy*P1D + comp*P1D*P1D + elem*BASIS_NCOMP*P1D*P1D];
  } else {
    U = 0.0;
  }
}
c532df63SYohann
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// Write DoFs
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
// Store r_V as the nodal value (tidx, tidy) of component comp of element elem
// in d_V. Threads outside the P1D x P1D node range store nothing.
// nelem is accepted but not used here.
inline __device__ void writeDofs2d(const int elem, const int tidx,
                                   const int tidy, const int comp,
                                   const int nelem, const CeedScalar &r_V,
                                   CeedScalar *d_V) {
  if (tidx >= P1D || tidy >= P1D)
    return;
  const int node = tidx + tidy*P1D + comp*P1D*P1D + elem*BASIS_NCOMP*P1D*P1D;
  d_V[node] = r_V;
}
c532df63SYohann
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// Read quadrature point data
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
// Load into U the quadrature value (tidx, tidy) of (elem, comp, dim) from
// d_U, using the index
//   tidx + tidy*Q1D + elem*Q1D*Q1D + comp*Q1D*Q1D*nelem
//        + dim*BASIS_NCOMP*nelem*Q1D*Q1D.
inline __device__ void readQuads2d(const int elem, const int tidx,
                                   const int tidy, const int comp,
                                   const int dim, const int nelem,
                                   const CeedScalar *d_U, CeedScalar &U ) {
  const int pointOffset = tidx + tidy*Q1D;
  const int elemOffset = elem*Q1D*Q1D;
  const int compOffset = comp*Q1D*Q1D*nelem;
  const int dimOffset = dim*BASIS_NCOMP*nelem*Q1D*Q1D;
  U = d_U[pointOffset + elemOffset + compOffset + dimOffset];
}
c532df63SYohann
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// Write quadrature point data
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
// Store r_V as the quadrature value (tidx, tidy) of (elem, comp, dim) in d_V,
// using the index
//   tidx + tidy*Q1D + elem*Q1D*Q1D + comp*Q1D*Q1D*nelem
//        + dim*BASIS_NCOMP*nelem*Q1D*Q1D.
inline __device__ void writeQuads2d(const int elem, const int tidx,
                                    const int tidy, const int comp,
                                    const int dim, const int nelem,
                                    const CeedScalar &r_V, CeedScalar *d_V) {
  const int pointOffset = tidx + tidy*Q1D;
  const int elemOffset = elem*Q1D*Q1D;
  const int compOffset = comp*Q1D*Q1D*nelem;
  const int dimOffset = dim*BASIS_NCOMP*nelem*Q1D*Q1D;
  d_V[pointOffset + elemOffset + compOffset + dimOffset] = r_V;
}
c532df63SYohann
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// 2D tensor contraction x
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
// Contract along x: each thread stores U at its (tidx, tidy, tidz) position
// of slice, then computes
//   V = sum over i in [0, P1D) of B[i + tidx*P1D] * slice[i, tidy, tidz].
// Barriers separate the store from the reads and the reads from whatever
// reuses slice next, so every thread of the block must call this function.
inline __device__ void ContractX2d(CeedScalar *slice, const int tidx,
                                   const int tidy, const int tidz,
                                   const CeedScalar &U, const CeedScalar *B,
                                   CeedScalar &V) {
  const int rowStart = tidy*Q1D + tidz*Q1D*Q1D;
  slice[tidx + rowStart] = U;
  __syncthreads();
  CeedScalar sum = 0.0;
  for (int p = 0; p < P1D; ++p)
    sum += B[p + tidx*P1D] * slice[p + rowStart]; // Contract x direction
  V = sum;
  __syncthreads();
}
c532df63SYohann
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// 2D tensor contraction y
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
// Contract along y: each thread stores U at its (tidx, tidy, tidz) position
// of slice, then computes
//   V = sum over i in [0, P1D) of B[i + tidy*P1D] * slice[tidx, i, tidz].
// Barriers separate the store from the reads and the reads from whatever
// reuses slice next, so every thread of the block must call this function.
inline __device__ void ContractY2d(CeedScalar *slice, const int tidx,
                                   const int tidy, const int tidz,
                                   const CeedScalar &U, const CeedScalar *B,
                                   CeedScalar &V) {
  const int planeStart = tidz*Q1D*Q1D;
  slice[tidx + tidy*Q1D + planeStart] = U;
  __syncthreads();
  CeedScalar sum = 0.0;
  for (int p = 0; p < P1D; ++p)
    sum += B[p + tidy*P1D] * slice[tidx + p*Q1D + planeStart]; // Contract y direction
  V = sum;
  __syncthreads();
}
c532df63SYohann
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// 2D transpose tensor contraction y
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
// Transposed contraction along y: each thread stores U at its
// (tidx, tidy, tidz) position of slice, then threads with tidy < P1D compute
//   V = sum over i in [0, Q1D) of B[tidy + i*P1D] * slice[tidx, i, tidz].
// Threads with tidy >= P1D get V = 0. Both barriers are executed by every
// thread regardless of the guard.
inline __device__ void ContractTransposeY2d(CeedScalar *slice, const int tidx,
    const int tidy, const int tidz,
    const CeedScalar &U, const CeedScalar *B, CeedScalar &V) {
  const int planeStart = tidz*Q1D*Q1D;
  slice[tidx + tidy*Q1D + planeStart] = U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (tidy < P1D) {
    for (int q = 0; q < Q1D; ++q)
      sum += B[tidy + q*P1D] * slice[tidx + q*Q1D + planeStart]; // Contract y direction
  }
  V = sum;
  __syncthreads();
}
c532df63SYohann
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// 2D transpose tensor contraction x
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
// Transposed contraction along x: each thread stores U at its
// (tidx, tidy, tidz) position of slice, then threads with tidx < P1D compute
//   V = sum over i in [0, Q1D) of B[tidx + i*P1D] * slice[i, tidy, tidz].
// Threads with tidx >= P1D get V = 0. Both barriers are executed by every
// thread regardless of the guard.
inline __device__ void ContractTransposeX2d(CeedScalar *slice, const int tidx,
    const int tidy, const int tidz,
    const CeedScalar &U, const CeedScalar *B, CeedScalar &V) {
  const int rowStart = tidy*Q1D + tidz*Q1D*Q1D;
  slice[tidx + rowStart] = U;
  __syncthreads();
  CeedScalar sum = 0.0;
  if (tidx < P1D) {
    for (int q = 0; q < Q1D; ++q)
      sum += B[tidx + q*P1D] * slice[q + rowStart]; // Contract x direction
  }
  V = sum;
  __syncthreads();
}
c532df63SYohann
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// 2D interpolate to quadrature points
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
// 2D interpolation between nodes and quadrature points.
//   transpose == 0: nodal values in d_U are contracted with c_B along x and
//                   then y, and written to d_V at the quadrature points
//                   (dim index 0).
//   transpose != 0: quadrature values in d_U are contracted with the
//                   transpose of c_B along y and then x, and written to the
//                   nodes in d_V.
// threadIdx.z encodes both the element within the block (tidz/BASIS_NCOMP)
// and the component (tidz%BASIS_NCOMP), so blockDim.z/BASIS_NCOMP elements
// are processed per block and each thread handles a single component.
// NOTE(review): the contraction helpers call __syncthreads() inside the
// element loop, so all threads of a block need the same trip count -
// confirm that the launch configuration guarantees this.
inline __device__ void interp2d(const CeedInt nelem, const int transpose,
                                const CeedScalar *c_B,
                                const CeedScalar *__restrict__ d_U,
                                CeedScalar *__restrict__ d_V,
                                CeedScalar *slice) {
  CeedScalar r_V;
  CeedScalar r_t;

  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int tidz = threadIdx.z;
  const int blockElem = tidz/BASIS_NCOMP;
  const int elemsPerBlock = blockDim.z/BASIS_NCOMP;
  // Component handled by this thread; it does not change across elements
  const int comp = tidz%BASIS_NCOMP;

  for (CeedInt elem = blockIdx.x*elemsPerBlock + blockElem; elem < nelem;
       elem += gridDim.x*elemsPerBlock) {
    r_V = 0.0;
    r_t = 0.0;
    if (!transpose) {
      // Nodes -> quadrature points
      readDofs2d(elem, tidx, tidy, comp, nelem, d_U, r_V);
      ContractX2d(slice, tidx, tidy, tidz, r_V, c_B, r_t);
      ContractY2d(slice, tidx, tidy, tidz, r_t, c_B, r_V);
      writeQuads2d(elem, tidx, tidy, comp, 0, nelem, r_V, d_V);
    } else {
      // Quadrature points -> nodes
      readQuads2d(elem, tidx, tidy, comp, 0, nelem, d_U, r_V);
      ContractTransposeY2d(slice, tidx, tidy, tidz, r_V, c_B, r_t);
      ContractTransposeX2d(slice, tidx, tidy, tidz, r_t, c_B, r_V);
      writeDofs2d(elem, tidx, tidy, comp, nelem, r_V, d_V);
    }
  }
}
c532df63SYohann
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// 2D derivatives at quadrature points
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
// 2D derivatives between nodes and quadrature points.
//   transpose == 0: for dim 0 the nodal values are contracted with c_G along
//                   x and c_B along y; for dim 1 with c_B along x and c_G
//                   along y. Each result is written to d_V under its dim.
//   transpose != 0: the dim 0 and dim 1 quadrature values are each contracted
//                   with the transposed matrices (y first, then x), summed,
//                   and written to the nodes in d_V.
// threadIdx.z encodes both the element within the block (tidz/BASIS_NCOMP)
// and the component (tidz%BASIS_NCOMP).
inline __device__ void grad2d(const CeedInt nelem, const int transpose,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              const CeedScalar *__restrict__ d_U,
                              CeedScalar *__restrict__ d_V, CeedScalar *slice) {
  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int tidz = threadIdx.z;
  const int elemsPerBlock = blockDim.z/BASIS_NCOMP;
  const int blockElem = tidz/BASIS_NCOMP;
  const int comp = tidz%BASIS_NCOMP;
  const unsigned int elemStride = gridDim.x*elemsPerBlock;

  CeedScalar r_U, r_V, r_t;

  for (CeedInt elem = blockIdx.x*elemsPerBlock + blockElem; elem < nelem;
       elem += elemStride) {
    if (transpose) {
      // Contribution of the x derivative
      readQuads2d(elem, tidx, tidy, comp, 0, nelem, d_U, r_U);
      ContractTransposeY2d(slice, tidx, tidy, tidz, r_U, c_B, r_t);
      ContractTransposeX2d(slice, tidx, tidy, tidz, r_t, c_G, r_V);
      // Contribution of the y derivative, accumulated into r_V
      readQuads2d(elem, tidx, tidy, comp, 1, nelem, d_U, r_U);
      ContractTransposeY2d(slice, tidx, tidy, tidz, r_U, c_G, r_t);
      ContractTransposeX2d(slice, tidx, tidy, tidz, r_t, c_B, r_U);
      r_V += r_U;
      writeDofs2d(elem, tidx, tidy, comp, nelem, r_V, d_V);
    } else {
      readDofs2d(elem, tidx, tidy, comp, nelem, d_U, r_U);
      // x derivative
      ContractX2d(slice, tidx, tidy, tidz, r_U, c_G, r_t);
      ContractY2d(slice, tidx, tidy, tidz, r_t, c_B, r_V);
      writeQuads2d(elem, tidx, tidy, comp, 0, nelem, r_V, d_V);
      // y derivative
      ContractX2d(slice, tidx, tidy, tidz, r_U, c_B, r_t);
      ContractY2d(slice, tidx, tidy, tidz, r_t, c_G, r_V);
      writeQuads2d(elem, tidx, tidy, comp, 1, nelem, r_V, d_V);
    }
  }
}
c532df63SYohann
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// 2D quadrature weights
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
// Fill w with the 2D tensor-product quadrature weights: for each element
// handled by this thread,
//   w[elem*Q1D*Q1D + i + j*Q1D] = qweight1d[i]*qweight1d[j]
// with i = threadIdx.x and j = threadIdx.y. Elements are distributed over
// blockIdx.x and threadIdx.z.
__device__ void weight2d(const CeedInt nelem, const CeedScalar *qweight1d,
                         CeedScalar *w) {
  const int qx = threadIdx.x;
  const int qy = threadIdx.y;
  const CeedScalar wxy = qweight1d[qx]*qweight1d[qy];
  const unsigned int elemStride = gridDim.x*blockDim.z;
  CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z;
  while (elem < nelem) {
    w[elem*Q1D*Q1D + qx + qy*Q1D] = wxy;
    elem += elemStride;
  }
}
*ab213215SJeremy L Thompson
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// 3D
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// Read DoFs
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
// Load into r_U[0..P1D-1] the column of nodal values at (tidx, tidy) of
// component comp of element elem, one entry per z level, and zero
// r_U[P1D..Q1D-1]. Threads outside the P1D x P1D node range get zeros for
// every entry and do not touch d_U. nelem is accepted but not used here.
inline __device__ void readDofs3d(const int elem, const int tidx,
                                  const int tidy, const int comp,
                                  const int nelem, const CeedScalar *d_U,
                                  CeedScalar *r_U) {
  const int columnStart = tidx + tidy*P1D;
  const int blockStart = comp*P1D*P1D*P1D + elem*BASIS_NCOMP*P1D*P1D*P1D;
  if (tidx < P1D && tidy < P1D) {
    for (int z = 0; z < P1D; z++)
      r_U[z] = d_U[columnStart + z*P1D*P1D + blockStart];
  } else {
    for (int z = 0; z < P1D; z++)
      r_U[z] = 0.0;
  }
  // Zero padding up to Q1D entries
  for (int z = P1D; z < Q1D; z++)
    r_U[z] = 0.0;
}
c532df63SYohann
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// Read quadrature point data
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
// Load into r_U[0..Q1D-1] the column of quadrature values at (tidx, tidy) of
// (elem, comp, dim) from d_U, one entry per z level. The source index is
//   tidx + tidy*Q1D + i*Q1D*Q1D + elem*Q1D*Q1D*Q1D
//        + comp*Q1D*Q1D*Q1D*nelem + dim*BASIS_NCOMP*nelem*Q1D*Q1D*Q1D.
inline __device__ void readQuads3d(const int elem, const int tidx,
                                   const int tidy, const int comp,
                                   const int dim, const int nelem,
                                   const CeedScalar *d_U, CeedScalar *r_U) {
  const int columnStart = tidx + tidy*Q1D;
  const int blockStart = elem*Q1D*Q1D*Q1D + comp*Q1D*Q1D*Q1D*nelem +
                         dim*BASIS_NCOMP*nelem*Q1D*Q1D*Q1D;
  for (int z = 0; z < Q1D; z++)
    r_U[z] = d_U[columnStart + z*Q1D*Q1D + blockStart];
}
c532df63SYohann
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// Write DoFs
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
// Store r_V[0..P1D-1] as the column of nodal values at (tidx, tidy) of
// component comp of element elem in d_V, one entry per z level. Threads
// outside the P1D x P1D node range store nothing. nelem is not used here.
inline __device__ void writeDofs3d(const int elem, const int tidx,
                                   const int tidy, const int comp,
                                   const int nelem, const CeedScalar *r_V,
                                   CeedScalar *d_V) {
  if (tidx >= P1D || tidy >= P1D)
    return;
  const int columnStart = tidx + tidy*P1D;
  const int blockStart = comp*P1D*P1D*P1D + elem*BASIS_NCOMP*P1D*P1D*P1D;
  for (int z = 0; z < P1D; z++)
    d_V[columnStart + z*P1D*P1D + blockStart] = r_V[z];
}
c532df63SYohann
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// Write quadrature point data
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
// Store r_V[0..Q1D-1] as the column of quadrature values at (tidx, tidy) of
// (elem, comp, dim) in d_V, one entry per z level. The destination index is
//   tidx + tidy*Q1D + i*Q1D*Q1D + elem*Q1D*Q1D*Q1D
//        + comp*Q1D*Q1D*Q1D*nelem + dim*BASIS_NCOMP*nelem*Q1D*Q1D*Q1D.
inline __device__ void writeQuads3d(const int elem, const int tidx,
                                    const int tidy, const int comp,
                                    const int dim, const int nelem,
                                    const CeedScalar *r_V, CeedScalar *d_V) {
  const int columnStart = tidx + tidy*Q1D;
  const int blockStart = elem*Q1D*Q1D*Q1D + comp*Q1D*Q1D*Q1D*nelem +
                         dim*BASIS_NCOMP*nelem*Q1D*Q1D*Q1D;
  for (int z = 0; z < Q1D; z++)
    d_V[columnStart + z*Q1D*Q1D + blockStart] = r_V[z];
}
c532df63SYohann
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// 3D tensor contract x
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
// Contract along x, one z level at a time. For each k in [0, P1D) every
// thread stores U[k] at its (tidx, tidy, tidz) position of slice and then
// computes
//   V[k] = sum over i in [0, P1D) of B[i + tidx*P1D] * slice[i, tidy, tidz].
// Two barriers per level protect the slice store and its reuse, so every
// thread of the block must call this function.
inline __device__ void ContractX3d(CeedScalar *slice, const int tidx,
                                   const int tidy, const int tidz,
                                   const CeedScalar *U, const CeedScalar *B,
                                   CeedScalar *V) {
  const int rowStart = tidy*Q1D + tidz*Q1D*Q1D;
  for (int level = 0; level < P1D; ++level) {
    slice[tidx + rowStart] = U[level];
    __syncthreads();
    CeedScalar sum = 0.0;
    for (int p = 0; p < P1D; ++p)
      sum += B[p + tidx*P1D] * slice[p + rowStart]; // Contract x direction
    V[level] = sum;
    __syncthreads();
  }
}
c532df63SYohann
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// 3D tensor contract y
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
// Contract along y, one z level at a time. For each k in [0, P1D) every
// thread stores U[k] at its (tidx, tidy, tidz) position of slice and then
// computes
//   V[k] = sum over i in [0, P1D) of B[i + tidy*P1D] * slice[tidx, i, tidz].
// Two barriers per level protect the slice store and its reuse, so every
// thread of the block must call this function.
inline __device__ void ContractY3d(CeedScalar *slice, const int tidx,
                                   const int tidy, const int tidz,
                                   const CeedScalar *U, const CeedScalar *B,
                                   CeedScalar *V) {
  const int planeStart = tidz*Q1D*Q1D;
  for (int level = 0; level < P1D; ++level) {
    slice[tidx + tidy*Q1D + planeStart] = U[level];
    __syncthreads();
    CeedScalar sum = 0.0;
    for (int p = 0; p < P1D; ++p)
      sum += B[p + tidy*P1D] * slice[tidx + p*Q1D + planeStart]; // Contract y direction
    V[level] = sum;
    __syncthreads();
  }
}
c532df63SYohann
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// 3D tensor contract z
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
// Contract along z using only the per-thread arrays:
//   V[k] = sum over i in [0, P1D) of B[i + k*P1D] * U[i], for k in [0, Q1D).
// slice, tidx, tidy and tidz are accepted but not used, and no barrier is
// involved.
inline __device__ void ContractZ3d(CeedScalar *slice, const int tidx,
                                   const int tidy, const int tidz,
                                   const CeedScalar *U, const CeedScalar *B,
                                   CeedScalar *V) {
  int q = 0;
  while (q < Q1D) {
    const int matStart = q*P1D;
    V[q] = 0.0;
    for (int p = 0; p < P1D; ++p)
      V[q] += B[p + matStart] * U[p]; // Contract z direction
    ++q;
  }
}
c532df63SYohann
//------------------------------------------------------------------------------
// 3D transpose tensor contract z
//
// Purely per-thread: for k < P1D, V[k] = sum over i < Q1D of
// B[k + i*P1D] * U[i]. All Q1D entries of V are written; entries with
// k >= P1D are set to zero. The slice and thread index arguments are unused
// and no synchronization takes place.
//------------------------------------------------------------------------------
inline __device__ void ContractTransposeZ3d(CeedScalar *slice, const int tidx,
    const int tidy, const int tidz,
    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  for (int p = 0; p < Q1D; ++p) {
    CeedScalar sum = 0.0;
    if (p < P1D) {
      for (int q = 0; q < Q1D; ++q) {
        sum += B[p + q*P1D] * U[q]; // Contract z direction
      }
    }
    V[p] = sum;
  }
}
c532df63SYohann
//------------------------------------------------------------------------------
// 3D transpose tensor contract y
//
// For each of the first P1D entries of U, the thread publishes the entry in
// the shared slice, waits at a block barrier, and, when tidy < P1D, sums
// B[tidy + i*P1D] * slice[tidx + i*Q1D + tidz*Q1D*Q1D] over i < Q1D into V.
// Threads with tidy >= P1D store zero. The barriers sit outside the tidy
// test, so every thread of the block reaches them; all threads must call
// this function.
//------------------------------------------------------------------------------
inline __device__ void ContractTransposeY3d(CeedScalar *slice, const int tidx,
    const int tidy, const int tidz,
    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  const int zOffset = tidz*Q1D*Q1D;           // start of this thread z-layer
  const int own = tidx + tidy*Q1D + zOffset;  // slot written by this thread
  for (int k = 0; k < P1D; ++k) {
    slice[own] = U[k];
    __syncthreads();
    CeedScalar sum = 0.0;
    if (tidy < P1D) {
      for (int q = 0; q < Q1D; ++q) {
        sum += B[tidy + q*P1D] * slice[tidx + q*Q1D + zOffset]; // Contract y direction
      }
    }
    V[k] = sum;
    __syncthreads();
  }
}
c532df63SYohann
//------------------------------------------------------------------------------
// 3D transpose tensor contract x
//
// For each of the first P1D entries of U, the thread publishes the entry in
// the shared slice, waits at a block barrier, and, when tidx < P1D, sums
// B[tidx + i*P1D] * slice[i + tidy*Q1D + tidz*Q1D*Q1D] over i < Q1D into V.
// Threads with tidx >= P1D store zero. The barriers sit outside the tidx
// test, so every thread of the block reaches them; all threads must call
// this function.
//------------------------------------------------------------------------------
inline __device__ void ContractTransposeX3d(CeedScalar *slice, const int tidx,
    const int tidy, const int tidz,
    const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
  const int rowStart = tidy*Q1D + tidz*Q1D*Q1D; // start of this thread x-row
  for (int k = 0; k < P1D; ++k) {
    slice[tidx + rowStart] = U[k];
    __syncthreads();
    CeedScalar sum = 0.0;
    if (tidx < P1D) {
      for (int q = 0; q < Q1D; ++q) {
        sum += B[tidx + q*P1D] * slice[q + rowStart]; // Contract x direction
      }
    }
    V[k] = sum;
    __syncthreads();
  }
}
c532df63SYohann
//------------------------------------------------------------------------------
// 3D interpolate to quadrature points
//
// threadIdx.x and threadIdx.y select a position inside the element, while
// threadIdx.z encodes both the element handled within the block
// (tidz / BASIS_NCOMP) and the component (tidz % BASIS_NCOMP).
// Elements are visited starting at blockIdx.x*elemsPerBlock + element in block
// and advancing by gridDim.x*elemsPerBlock until nelem is reached.
//   transpose == 0: readDofs3d, contract x, y, z with c_B, writeQuads3d
//   transpose != 0: readQuads3d, transpose contract z, y, x with c_B,
//                   writeDofs3d
// slice is the shared scratch buffer used by the x and y contractions.
//------------------------------------------------------------------------------
inline __device__ void interp3d(const CeedInt nelem, const int transpose,
                                const CeedScalar *c_B,
                                const CeedScalar *__restrict__ d_U,
                                CeedScalar *__restrict__ d_V,
                                CeedScalar *slice) {
  // Per-thread work arrays, alternated between the contraction stages
  CeedScalar r_V[Q1D];
  CeedScalar r_t[Q1D];

  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int tidz = threadIdx.z;
  const int elemsPerBlock = blockDim.z/BASIS_NCOMP;
  const int elemInBlock = tidz/BASIS_NCOMP;
  const int comp = tidz%BASIS_NCOMP;

  for (CeedInt e = blockIdx.x*elemsPerBlock + elemInBlock; e < nelem;
       e += gridDim.x*elemsPerBlock) {
    // Start every element from cleared work arrays
    for (int q = 0; q < Q1D; ++q) {
      r_V[q] = 0.0;
      r_t[q] = 0.0;
    }
    if (transpose) {
      // Quadrature points to dofs
      readQuads3d(e, tidx, tidy, comp, 0, nelem, d_U, r_V);
      ContractTransposeZ3d(slice, tidx, tidy, tidz, r_V, c_B, r_t);
      ContractTransposeY3d(slice, tidx, tidy, tidz, r_t, c_B, r_V);
      ContractTransposeX3d(slice, tidx, tidy, tidz, r_V, c_B, r_t);
      writeDofs3d(e, tidx, tidy, comp, nelem, r_t, d_V);
    } else {
      // Dofs to quadrature points
      readDofs3d(e, tidx, tidy, comp, nelem, d_U, r_V);
      ContractX3d(slice, tidx, tidy, tidz, r_V, c_B, r_t);
      ContractY3d(slice, tidx, tidy, tidz, r_t, c_B, r_V);
      ContractZ3d(slice, tidx, tidy, tidz, r_V, c_B, r_t);
      writeQuads3d(e, tidx, tidy, comp, 0, nelem, r_t, d_V);
    }
  }
}
c532df63SYohann
//------------------------------------------------------------------------------
// 3D derivatives at quadrature points
//
// Thread and element indexing is the same as in interp3d: threadIdx.z encodes
// the element within the block and the component, and elements advance by
// gridDim.x*elemsPerBlock.
//   transpose == 0: the dofs are read once and, for each direction 0, 1, 2,
//     contracted with c_G in that direction and c_B in the other two; each
//     result is written with writeQuads3d under the matching dim index.
//   transpose != 0: for each dim index 0, 1, 2 the quadrature values are read,
//     transpose contracted in z, y, x with c_G in the matching direction and
//     c_B in the other two; the three results are summed with add and written
//     once with writeDofs3d.
// slice is the shared scratch buffer used by the x and y contractions.
//------------------------------------------------------------------------------
inline __device__ void grad3d(const CeedInt nelem, const int transpose,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              const CeedScalar *__restrict__ d_U,
                              CeedScalar *__restrict__ d_V,
                              CeedScalar *slice) {
  // Note kept from the original author: use P1D for one of these
  CeedScalar r_U[Q1D];
  CeedScalar r_V[Q1D];
  CeedScalar r_t[Q1D];

  const int tidx = threadIdx.x;
  const int tidy = threadIdx.y;
  const int tidz = threadIdx.z;
  const int elemsPerBlock = blockDim.z/BASIS_NCOMP;
  const int elemInBlock = tidz/BASIS_NCOMP;
  const int comp = tidz%BASIS_NCOMP;

  for (CeedInt e = blockIdx.x*elemsPerBlock + elemInBlock; e < nelem;
       e += gridDim.x*elemsPerBlock) {
    if (transpose) {
      // Direction 0: c_G applied in x, result starts the sum in r_V
      readQuads3d(e, tidx, tidy, comp, 0, nelem, d_U, r_U);
      ContractTransposeZ3d(slice, tidx, tidy, tidz, r_U, c_B, r_t);
      ContractTransposeY3d(slice, tidx, tidy, tidz, r_t, c_B, r_U);
      ContractTransposeX3d(slice, tidx, tidy, tidz, r_U, c_G, r_V);
      // Direction 1: c_G applied in y, accumulated into r_V
      readQuads3d(e, tidx, tidy, comp, 1, nelem, d_U, r_U);
      ContractTransposeZ3d(slice, tidx, tidy, tidz, r_U, c_B, r_t);
      ContractTransposeY3d(slice, tidx, tidy, tidz, r_t, c_G, r_U);
      ContractTransposeX3d(slice, tidx, tidy, tidz, r_U, c_B, r_t);
      add(r_V, r_t);
      // Direction 2: c_G applied in z, accumulated into r_V
      readQuads3d(e, tidx, tidy, comp, 2, nelem, d_U, r_U);
      ContractTransposeZ3d(slice, tidx, tidy, tidz, r_U, c_G, r_t);
      ContractTransposeY3d(slice, tidx, tidy, tidz, r_t, c_B, r_U);
      ContractTransposeX3d(slice, tidx, tidy, tidz, r_U, c_B, r_t);
      add(r_V, r_t);
      writeDofs3d(e, tidx, tidy, comp, nelem, r_V, d_V);
    } else {
      readDofs3d(e, tidx, tidy, comp, nelem, d_U, r_U);
      // Direction 0: c_G applied in x
      ContractX3d(slice, tidx, tidy, tidz, r_U, c_G, r_V);
      ContractY3d(slice, tidx, tidy, tidz, r_V, c_B, r_t);
      ContractZ3d(slice, tidx, tidy, tidz, r_t, c_B, r_V);
      writeQuads3d(e, tidx, tidy, comp, 0, nelem, r_V, d_V);
      // Direction 1: c_G applied in y
      ContractX3d(slice, tidx, tidy, tidz, r_U, c_B, r_V);
      ContractY3d(slice, tidx, tidy, tidz, r_V, c_G, r_t);
      ContractZ3d(slice, tidx, tidy, tidz, r_t, c_B, r_V);
      writeQuads3d(e, tidx, tidy, comp, 1, nelem, r_V, d_V);
      // Direction 2: c_G applied in z
      ContractX3d(slice, tidx, tidy, tidz, r_U, c_B, r_V);
      ContractY3d(slice, tidx, tidy, tidz, r_V, c_B, r_t);
      ContractZ3d(slice, tidx, tidy, tidz, r_t, c_G, r_V);
      writeQuads3d(e, tidx, tidy, comp, 2, nelem, r_V, d_V);
    }
  }
}
c532df63SYohann
//------------------------------------------------------------------------------
// 3D quadrature weights
//
// Each thread owns the quadrature point given by threadIdx.x, threadIdx.y and
// threadIdx.z. Its weight is the product of the three matching 1D weights and
// is stored at e*Q1D*Q1D*Q1D + x + y*Q1D + z*Q1D*Q1D for every element e
// visited by the block, starting at blockIdx.x and advancing by gridDim.x.
//------------------------------------------------------------------------------
__device__ void weight3d(const CeedInt nelem, const CeedScalar *qweight1d,
                         CeedScalar *w) {
  const int qx = threadIdx.x;
  const int qy = threadIdx.y;
  const int qz = threadIdx.z;
  // The weight does not depend on the element, so compute it once
  const CeedScalar pointWeight = qweight1d[qx]*qweight1d[qy]*qweight1d[qz];
  for (int elem = blockIdx.x; elem < nelem; elem += gridDim.x) {
    w[elem*Q1D*Q1D*Q1D + qx + qy*Q1D + qz*Q1D*Q1D] = pointWeight;
  }
}
*ab213215SJeremy L Thompson
*ab213215SJeremy L Thompson
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// Basis kernels
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson
//------------------------------------------------------------------------------
// Interp kernel by dim
//
// Entry point that forwards to interp1d, interp2d or interp3d according to
// the compile time value of BASIS_DIM.
//   nelem     - number of elements to process
//   transpose - 0 maps dofs to quadrature points, nonzero applies the transpose
//   c_B       - 1D interpolation matrix
//   d_U, d_V  - input and output arrays
// Launch configuration used by the host code of this backend: block of
// Q1d x 1 x elemsPerBlock threads in 1D and Q1d x Q1d x ncomp*elemsPerBlock
// threads in 2D and 3D, with one CeedScalar of dynamic shared memory per
// thread.
// The dynamic shared buffer is declared as CeedScalar so that its type always
// matches the CeedScalar pointer expected by the per dimension routines.
//------------------------------------------------------------------------------
extern "C" __global__ void interp(const CeedInt nelem, const int transpose,
                                  const CeedScalar *c_B,
                                  const CeedScalar *__restrict__ d_U,
                                  CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];
  if (BASIS_DIM == 1) {
    interp1d(nelem, transpose, c_B, d_U, d_V, slice);
  } else if (BASIS_DIM == 2) {
    interp2d(nelem, transpose, c_B, d_U, d_V, slice);
  } else if (BASIS_DIM == 3) {
    interp3d(nelem, transpose, c_B, d_U, d_V, slice);
  }
}
c532df63SYohann
//------------------------------------------------------------------------------
// Grad kernel by dim
//
// Entry point that forwards to grad1d, grad2d or grad3d according to the
// compile time value of BASIS_DIM.
//   nelem     - number of elements to process
//   transpose - 0 maps dofs to derivatives at quadrature points, nonzero
//               applies the transpose
//   c_B, c_G  - 1D interpolation and gradient matrices
//   d_U, d_V  - input and output arrays
// Launch configuration used by the host code of this backend: block of
// Q1d x 1 x elemsPerBlock threads in 1D and Q1d x Q1d x ncomp*elemsPerBlock
// threads in 2D and 3D, with one CeedScalar of dynamic shared memory per
// thread.
// The dynamic shared buffer is declared as CeedScalar so that its type always
// matches the CeedScalar pointer expected by the per dimension routines.
//------------------------------------------------------------------------------
extern "C" __global__ void grad(const CeedInt nelem, const int transpose,
                                const CeedScalar *c_B, const CeedScalar *c_G,
                                const CeedScalar *__restrict__ d_U,
                                CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];
  if (BASIS_DIM == 1) {
    grad1d(nelem, transpose, c_B, c_G, d_U, d_V, slice);
  } else if (BASIS_DIM == 2) {
    grad2d(nelem, transpose, c_B, c_G, d_U, d_V, slice);
  } else if (BASIS_DIM == 3) {
    grad3d(nelem, transpose, c_B, c_G, d_U, d_V, slice);
  }
}
c532df63SYohann
//------------------------------------------------------------------------------
// Weight kernels by dim
//
// Entry point that forwards to weight1d, weight2d or weight3d according to
// the compile time value of BASIS_DIM; any other value does nothing.
//   nelem     - number of elements to fill
//   qweight1d - 1D quadrature weights
//   v         - output array receiving the weights
//------------------------------------------------------------------------------
extern "C" __global__ void weight(const CeedInt nelem,
                                  const CeedScalar *__restrict__ qweight1d,
                                  CeedScalar *__restrict__ v) {
  switch (BASIS_DIM) {
  case 1:
    weight1d(nelem, qweight1d, v);
    break;
  case 2:
    weight2d(nelem, qweight1d, v);
    break;
  case 3:
    weight3d(nelem, qweight1d, v);
    break;
  default:
    break;
  }
}
c532df63SYohann
c532df63SYohann);
cb0b5415Sjeremylt// *INDENT-ON*
c532df63SYohann
//------------------------------------------------------------------------------
// Device initialization
//
// Prototypes of helpers defined outside this file. The apply routine below
// calls them with the 1D interpolation (and gradient) device arrays of the
// basis; the pointers they return through the trailing arguments are then
// passed to the interp and grad kernels as c_B and c_G.
//------------------------------------------------------------------------------
int CeedCudaInitInterp(CeedScalar *d_B,
                       CeedInt P1d,
                       CeedInt Q1d,
                       CeedScalar **c_B);

int CeedCudaInitInterpGrad(CeedScalar *d_B,
                           CeedScalar *d_G,
                           CeedInt P1d,
                           CeedInt Q1d,
                           CeedScalar **c_B_ptr,
                           CeedScalar **c_G_ptr);
c532df63SYohann
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// Apply basis
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
c532df63SYohannint CeedBasisApplyTensor_Cuda_shared(CeedBasis basis, const CeedInt nelem,
c532df63SYohann                                     CeedTransposeMode tmode,
7f823360Sjeremylt                                     CeedEvalMode emode, CeedVector u,
7f823360Sjeremylt                                     CeedVector v) {
c532df63SYohann  int ierr;
c532df63SYohann  Ceed ceed;
c532df63SYohann  ierr = CeedBasisGetCeed(basis, &ceed); CeedChk(ierr);
c532df63SYohann  Ceed_Cuda_shared *ceed_Cuda;
c532df63SYohann  CeedGetData(ceed, (void *) &ceed_Cuda); CeedChk(ierr);
c532df63SYohann  CeedBasis_Cuda_shared *data;
c532df63SYohann  CeedBasisGetData(basis, (void *)&data); CeedChk(ierr);
c532df63SYohann  const CeedInt transpose = tmode == CEED_TRANSPOSE;
4247ecf3SYohann Dudouit  CeedInt dim, ncomp;
074be161SYohann Dudouit  ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr);
4247ecf3SYohann Dudouit  ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChk(ierr);
c532df63SYohann
*ab213215SJeremy L Thompson  // Read vectors
c532df63SYohann  const CeedScalar *d_u;
c532df63SYohann  CeedScalar *d_v;
c532df63SYohann  if (emode != CEED_EVAL_WEIGHT) {
c532df63SYohann    ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChk(ierr);
c532df63SYohann  }
c532df63SYohann  ierr = CeedVectorGetArray(v, CEED_MEM_DEVICE, &d_v); CeedChk(ierr);
c532df63SYohann
*ab213215SJeremy L Thompson  // Clear v for transpose mode
c532df63SYohann  if (tmode == CEED_TRANSPOSE) {
c532df63SYohann    CeedInt length;
c532df63SYohann    ierr = CeedVectorGetLength(v, &length); CeedChk(ierr);
c532df63SYohann    ierr = cudaMemset(d_v, 0, length * sizeof(CeedScalar)); CeedChk(ierr);
c532df63SYohann  }
*ab213215SJeremy L Thompson
*ab213215SJeremy L Thompson  // Apply basis operation
*ab213215SJeremy L Thompson  switch (emode) {
*ab213215SJeremy L Thompson  case CEED_EVAL_INTERP: {
c532df63SYohann    CeedInt P1d, Q1d;
c532df63SYohann    ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
c532df63SYohann    ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
c532df63SYohann    ierr = CeedCudaInitInterp(data->d_interp1d, P1d, Q1d, &data->c_B);
c532df63SYohann    CeedChk(ierr);
cb0b5415Sjeremylt    void *interpargs[] = {(void *) &nelem, (void *) &transpose, &data->c_B,
ccf0fe6fSjeremylt                          &d_u, &d_v
ccf0fe6fSjeremylt                         };
4d537eeaSYohann    if (dim == 1) {
d94769d2SYohann Dudouit      CeedInt elemsPerBlock = 32;
4d537eeaSYohann      CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)
4d537eeaSYohann                                             ? 1 : 0 );
d94769d2SYohann Dudouit      CeedInt sharedMem = elemsPerBlock*Q1d*sizeof(CeedScalar);
4d537eeaSYohann      ierr = CeedRunKernelDimSharedCuda(ceed, data->interp, grid, Q1d, 1,
4d537eeaSYohann                                        elemsPerBlock, sharedMem,
*ab213215SJeremy L Thompson                                        interpargs); CeedChk(ierr);
074be161SYohann Dudouit    } else if (dim == 2) {
4247ecf3SYohann Dudouit      const CeedInt optElems[7] = {0,32,8,6,4,2,8};
4247ecf3SYohann Dudouit      CeedInt elemsPerBlock = Q1d < 7 ? optElems[Q1d]/ncomp : 1;
4d537eeaSYohann      CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)
4d537eeaSYohann                                             ? 1 : 0 );
4247ecf3SYohann Dudouit      CeedInt sharedMem = ncomp*elemsPerBlock*Q1d*Q1d*sizeof(CeedScalar);
4d537eeaSYohann      ierr = CeedRunKernelDimSharedCuda(ceed, data->interp, grid, Q1d, Q1d,
4d537eeaSYohann                                        ncomp*elemsPerBlock, sharedMem,
*ab213215SJeremy L Thompson                                        interpargs); CeedChk(ierr);
074be161SYohann Dudouit    } else if (dim == 3) {
3f63d318SYohann Dudouit      CeedInt elemsPerBlock = 1;
4d537eeaSYohann      CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)
4d537eeaSYohann                                             ? 1 : 0 );
698ebc35SYohann Dudouit      CeedInt sharedMem = ncomp*elemsPerBlock*Q1d*Q1d*sizeof(CeedScalar);
4d537eeaSYohann      ierr = CeedRunKernelDimSharedCuda(ceed, data->interp, grid, Q1d, Q1d,
4d537eeaSYohann                                        ncomp*elemsPerBlock, sharedMem,
*ab213215SJeremy L Thompson                                        interpargs); CeedChk(ierr);
074be161SYohann Dudouit    }
*ab213215SJeremy L Thompson  } break;
*ab213215SJeremy L Thompson  case CEED_EVAL_GRAD: {
c532df63SYohann    CeedInt P1d, Q1d;
c532df63SYohann    ierr = CeedBasisGetNumNodes1D(basis, &P1d); CeedChk(ierr);
c532df63SYohann    ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
c532df63SYohann    ierr = CeedCudaInitInterpGrad(data->d_interp1d, data->d_grad1d, P1d,
c532df63SYohann                                  Q1d, &data->c_B, &data->c_G);
c532df63SYohann    CeedChk(ierr);
cb0b5415Sjeremylt    void *gradargs[] = {(void *) &nelem, (void *) &transpose, &data->c_B,
ccf0fe6fSjeremylt                        &data->c_G, &d_u, &d_v
ccf0fe6fSjeremylt                       };
4d537eeaSYohann    if (dim == 1) {
d94769d2SYohann Dudouit      CeedInt elemsPerBlock = 32;
4d537eeaSYohann      CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)
4d537eeaSYohann                                             ? 1 : 0 );
d94769d2SYohann Dudouit      CeedInt sharedMem = elemsPerBlock*Q1d*sizeof(CeedScalar);
*ab213215SJeremy L Thompson      ierr = CeedRunKernelDimSharedCuda(ceed, data->grad, grid, Q1d, 1,
*ab213215SJeremy L Thompson                                        elemsPerBlock, sharedMem, gradargs);
c532df63SYohann      CeedChk(ierr);
074be161SYohann Dudouit    } else if (dim == 2) {
4247ecf3SYohann Dudouit      const CeedInt optElems[7] = {0,32,8,6,4,2,8};
4247ecf3SYohann Dudouit      CeedInt elemsPerBlock = Q1d < 7 ? optElems[Q1d]/ncomp : 1;
4d537eeaSYohann      CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)
4d537eeaSYohann                                             ? 1 : 0 );
4247ecf3SYohann Dudouit      CeedInt sharedMem = ncomp*elemsPerBlock*Q1d*Q1d*sizeof(CeedScalar);
4d537eeaSYohann      ierr = CeedRunKernelDimSharedCuda(ceed, data->grad, grid, Q1d, Q1d,
4d537eeaSYohann                                        ncomp*elemsPerBlock, sharedMem,
*ab213215SJeremy L Thompson                                        gradargs); CeedChk(ierr);
074be161SYohann Dudouit    } else if (dim == 3) {
3f63d318SYohann Dudouit      CeedInt elemsPerBlock = 1;
4d537eeaSYohann      CeedInt grid = nelem/elemsPerBlock + ( (nelem/elemsPerBlock*elemsPerBlock<nelem)
4d537eeaSYohann                                             ? 1 : 0 );
698ebc35SYohann Dudouit      CeedInt sharedMem = ncomp*elemsPerBlock*Q1d*Q1d*sizeof(CeedScalar);
4d537eeaSYohann      ierr = CeedRunKernelDimSharedCuda(ceed, data->grad, grid, Q1d, Q1d,
4d537eeaSYohann                                        ncomp*elemsPerBlock, sharedMem,
*ab213215SJeremy L Thompson                                        gradargs); CeedChk(ierr);
074be161SYohann Dudouit    }
*ab213215SJeremy L Thompson  } break;
*ab213215SJeremy L Thompson  case CEED_EVAL_WEIGHT: {
074be161SYohann Dudouit    CeedInt Q1d;
074be161SYohann Dudouit    ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr);
c532df63SYohann    void *weightargs[] = {(void *) &nelem, (void *) &data->d_qweight1d, &d_v};
074be161SYohann Dudouit    if (dim == 1) {
074be161SYohann Dudouit      const CeedInt elemsPerBlock = 32/Q1d;
4d537eeaSYohann      const CeedInt gridsize = nelem/elemsPerBlock + ( (
4d537eeaSYohann                                 nelem/elemsPerBlock*elemsPerBlock<nelem)? 1 : 0 );
7f823360Sjeremylt      ierr = CeedRunKernelDimCuda(ceed, data->weight, gridsize, Q1d,
7f823360Sjeremylt                                  elemsPerBlock, 1, weightargs);
1226057fSYohann Dudouit      CeedChk(ierr);
074be161SYohann Dudouit    } else if (dim == 2) {
717ff8a3SYohann Dudouit      const CeedInt optElems = 32/(Q1d*Q1d);
717ff8a3SYohann Dudouit      const CeedInt elemsPerBlock = optElems>0?optElems:1;
4d537eeaSYohann      const CeedInt gridsize = nelem/elemsPerBlock + ( (
4d537eeaSYohann                                 nelem/elemsPerBlock*elemsPerBlock<nelem)? 1 : 0 );
4d537eeaSYohann      ierr = CeedRunKernelDimCuda(ceed, data->weight, gridsize, Q1d, Q1d,
4d537eeaSYohann                                  elemsPerBlock, weightargs);
1226057fSYohann Dudouit      CeedChk(ierr);
074be161SYohann Dudouit    } else if (dim == 3) {
074be161SYohann Dudouit      const CeedInt gridsize = nelem;
4d537eeaSYohann      ierr = CeedRunKernelDimCuda(ceed, data->weight, gridsize, Q1d, Q1d, Q1d,
4d537eeaSYohann                                  weightargs);
1226057fSYohann Dudouit      CeedChk(ierr);
074be161SYohann Dudouit    }
*ab213215SJeremy L Thompson  } break;
*ab213215SJeremy L Thompson  // LCOV_EXCL_START
*ab213215SJeremy L Thompson  // Evaluate the divergence to/from the quadrature points
*ab213215SJeremy L Thompson  case CEED_EVAL_DIV:
*ab213215SJeremy L Thompson    return CeedError(ceed, 1, "CEED_EVAL_DIV not supported");
*ab213215SJeremy L Thompson  // Evaluate the curl to/from the quadrature points
*ab213215SJeremy L Thompson  case CEED_EVAL_CURL:
*ab213215SJeremy L Thompson    return CeedError(ceed, 1, "CEED_EVAL_CURL not supported");
*ab213215SJeremy L Thompson  // Take no action, BasisApply should not have been called
*ab213215SJeremy L Thompson  case CEED_EVAL_NONE:
*ab213215SJeremy L Thompson    return CeedError(ceed, 1,
*ab213215SJeremy L Thompson                     "CEED_EVAL_NONE does not make sense in this context");
*ab213215SJeremy L Thompson    // LCOV_EXCL_STOP
c532df63SYohann  }
c532df63SYohann
*ab213215SJeremy L Thompson  // Restore vectors
c532df63SYohann  if (emode != CEED_EVAL_WEIGHT) {
c532df63SYohann    ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChk(ierr);
c532df63SYohann  }
c532df63SYohann  ierr = CeedVectorRestoreArray(v, &d_v); CeedChk(ierr);
c532df63SYohann  return 0;
c532df63SYohann}
c532df63SYohann
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// Destroy basis
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
c532df63SYohannstatic int CeedBasisDestroy_Cuda_shared(CeedBasis basis) {
c532df63SYohann  int ierr;
c532df63SYohann  Ceed ceed;
c532df63SYohann  ierr = CeedBasisGetCeed(basis, &ceed); CeedChk(ierr);
c532df63SYohann
c532df63SYohann  CeedBasis_Cuda_shared *data;
c532df63SYohann  ierr = CeedBasisGetData(basis, (void *) &data); CeedChk(ierr);
c532df63SYohann
c532df63SYohann  CeedChk_Cu(ceed, cuModuleUnload(data->module));
c532df63SYohann
c532df63SYohann  ierr = cudaFree(data->d_qweight1d); CeedChk_Cu(ceed, ierr);
c532df63SYohann  ierr = cudaFree(data->d_interp1d); CeedChk_Cu(ceed, ierr);
c532df63SYohann  ierr = cudaFree(data->d_grad1d); CeedChk_Cu(ceed, ierr);
c532df63SYohann
c532df63SYohann  ierr = CeedFree(&data); CeedChk(ierr);
c532df63SYohann
c532df63SYohann  return 0;
c532df63SYohann}
c532df63SYohann
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
*ab213215SJeremy L Thompson// Create tensor basis
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------
c532df63SYohannint CeedBasisCreateTensorH1_Cuda_shared(CeedInt dim, CeedInt P1d, CeedInt Q1d,
c532df63SYohann                                        const CeedScalar *interp1d,
c532df63SYohann                                        const CeedScalar *grad1d,
c532df63SYohann                                        const CeedScalar *qref1d,
c532df63SYohann                                        const CeedScalar *qweight1d,
c532df63SYohann                                        CeedBasis basis) {
c532df63SYohann  int ierr;
c532df63SYohann  Ceed ceed;
c532df63SYohann  ierr = CeedBasisGetCeed(basis, &ceed); CeedChk(ierr);
4d537eeaSYohann  if (Q1d<P1d) {
1226057fSYohann Dudouit    return CeedError(ceed, 1, "Backend does not implement underintegrated basis.");
1226057fSYohann Dudouit  }
c532df63SYohann  CeedBasis_Cuda_shared *data;
c532df63SYohann  ierr = CeedCalloc(1, &data); CeedChk(ierr);
c532df63SYohann
*ab213215SJeremy L Thompson  // Copy basis data to GPU
c532df63SYohann  const CeedInt qBytes = Q1d * sizeof(CeedScalar);
c532df63SYohann  ierr = cudaMalloc((void **)&data->d_qweight1d, qBytes); CeedChk_Cu(ceed, ierr);
c532df63SYohann  ierr = cudaMemcpy(data->d_qweight1d, qweight1d, qBytes,
c532df63SYohann                    cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
c532df63SYohann
c532df63SYohann  const CeedInt iBytes = qBytes * P1d;
c532df63SYohann  ierr = cudaMalloc((void **)&data->d_interp1d, iBytes); CeedChk_Cu(ceed, ierr);
c532df63SYohann  ierr = cudaMemcpy(data->d_interp1d, interp1d, iBytes,
c532df63SYohann                    cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
c532df63SYohann
c532df63SYohann  ierr = cudaMalloc((void **)&data->d_grad1d, iBytes); CeedChk_Cu(ceed, ierr);
c532df63SYohann  ierr = cudaMemcpy(data->d_grad1d, grad1d, iBytes,
c532df63SYohann                    cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
c532df63SYohann
*ab213215SJeremy L Thompson  // Compute collocated gradient and copy to GPU
ac421f39SYohann  data->d_collograd1d = NULL;
ac421f39SYohann  if (dim == 3 && Q1d >= P1d) {
ac421f39SYohann    CeedScalar *collograd1d;
ac421f39SYohann    ierr = CeedMalloc(Q1d*Q1d, &collograd1d); CeedChk(ierr);
ac421f39SYohann    ierr = CeedBasisGetCollocatedGrad(basis, collograd1d); CeedChk(ierr);
ac421f39SYohann    ierr = cudaMalloc((void **)&data->d_collograd1d, qBytes * Q1d);
ac421f39SYohann    CeedChk_Cu(ceed, ierr);
ac421f39SYohann    ierr = cudaMemcpy(data->d_collograd1d, collograd1d, qBytes * Q1d,
ac421f39SYohann                      cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr);
ac421f39SYohann  }
ac421f39SYohann
*ab213215SJeremy L Thompson  // Compile basis kernels
c532df63SYohann  CeedInt ncomp;
c532df63SYohann  ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChk(ierr);
4a6d4bbdSYohann Dudouit  ierr = CeedCompileCuda(ceed, kernelsShared, &data->module, 7,
c532df63SYohann                         "Q1D", Q1d,
c532df63SYohann                         "P1D", P1d,
c532df63SYohann                         "BASIS_BUF_LEN", ncomp * CeedIntPow(Q1d > P1d ?
c532df63SYohann                             Q1d : P1d, dim),
c532df63SYohann                         "BASIS_DIM", dim,
c532df63SYohann                         "BASIS_NCOMP", ncomp,
c532df63SYohann                         "BASIS_ELEMSIZE", CeedIntPow(P1d, dim),
c532df63SYohann                         "BASIS_NQPT", CeedIntPow(Q1d, dim)
c532df63SYohann                        ); CeedChk(ierr);
4a6d4bbdSYohann Dudouit  ierr = CeedGetKernelCuda(ceed, data->module, "interp", &data->interp);
c532df63SYohann  CeedChk(ierr);
4a6d4bbdSYohann Dudouit  ierr = CeedGetKernelCuda(ceed, data->module, "grad", &data->grad);
c532df63SYohann  CeedChk(ierr);
4a6d4bbdSYohann Dudouit  ierr = CeedGetKernelCuda(ceed, data->module, "weight", &data->weight);
c532df63SYohann  CeedChk(ierr);
c532df63SYohann
*ab213215SJeremy L Thompson  ierr = CeedBasisSetData(basis, (void *)&data); CeedChk(ierr);
*ab213215SJeremy L Thompson
*ab213215SJeremy L Thompson  // Register backend functions
c532df63SYohann  ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply",
c532df63SYohann                                CeedBasisApplyTensor_Cuda_shared);
c532df63SYohann  CeedChk(ierr);
c532df63SYohann  ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy",
*ab213215SJeremy L Thompson                                CeedBasisDestroy_Cuda_shared); CeedChk(ierr);
c532df63SYohann  return 0;
c532df63SYohann}
*ab213215SJeremy L Thompson//------------------------------------------------------------------------------