// Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. // All Rights reserved. See files LICENSE and NOTICE for details. // // This file is part of CEED, a collection of benchmarks, miniapps, software // libraries and APIs for efficient high-order finite element and spectral // element discretizations for exascale applications. For more information and // source code availability see http://github.com/ceed. // // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, // a collaborative effort of two U.S. Department of Energy organizations (Office // of Science and the National Nuclear Security Administration) responsible for // the planning and preparation of a capable exascale ecosystem, including // software, applications, hardware, advanced system engineering and early // testbed platforms, in support of the nation's exascale computing imperative. #define CEED_DEBUG_COLOR 12 #include "ceed-cuda-gen.h" #include #include #include "../cuda-reg/ceed-cuda-reg.h" #include "../cuda-shared/ceed-cuda-shared.h" static const char *atomicAdd = QUOTE( //------------------------------------------------------------------------------ // Atomic add, for older CUDA //------------------------------------------------------------------------------ __device__ double atomicAdd(double *address, double val) { unsigned long long int *address_as_ull = (unsigned long long int *)address; unsigned long long int old = *address_as_ull, assumed; do { assumed = old; old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); // Note: uses integer comparison to avoid hang in case of NaN // (since NaN != NaN) } while (assumed != old); return __longlong_as_double(old); } ); static const char *deviceFunctions = QUOTE( //------------------------------------------------------------------------------ // Typedefs 
//------------------------------------------------------------------------------
// Arrays of input/output field pointers passed to the generated kernel
typedef struct { const CeedScalar* in[16]; CeedScalar* out[16]; } CudaFields;
typedef struct { CeedInt* in[16]; CeedInt* out[16]; } CudaFieldsInt;

// Per-thread bookkeeping: thread indices within the block and a pointer to
// this element's slab of dynamic shared memory ('slice')
typedef struct {
  CeedInt tidx;
  CeedInt tidy;
  CeedInt tidz;
  CeedInt tid;
  CeedScalar* slice;
} BackendData;

//------------------------------------------------------------------------------
// Load matrices for basis actions
//------------------------------------------------------------------------------
// All threads of the block cooperatively copy the P x Q basis matrix from
// global to (shared) memory.
// NOTE(review): template parameter lists were stripped from this file in
// transit; they are reconstructed here from in-body usage -- confirm upstream.
template <int P, int Q>
inline __device__ void loadMatrix(BackendData& data, const CeedScalar* d_B,
                                  CeedScalar* B) {
  for (CeedInt i = data.tid; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z)
    B[i] = d_B[i];
}

//------------------------------------------------------------------------------
// 1D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset1d(BackendData& data, const CeedInt nnodes,
                                        const CeedInt elem,
                                        const CeedInt* indices,
                                        const CeedScalar* d_u,
                                        CeedScalar* r_u) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP,
          int STRIDES_ELEM>
inline __device__ void readDofsStrided1d(BackendData& data, const CeedInt elem,
                                         const CeedScalar* d_u,
                                         CeedScalar* r_u) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// atomicAdd: multiple elements may contribute to the same L-vector entry
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset1d(BackendData& data,
                                         const CeedInt nnodes,
                                         const CeedInt elem,
                                         const CeedInt* indices,
                                         const CeedScalar* r_v,
                                         CeedScalar* d_v) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP,
          int STRIDES_ELEM>
inline __device__ void writeDofsStrided1d(BackendData& data,
                                          const CeedInt elem,
                                          const CeedScalar* r_v,
                                          CeedScalar* d_v) {
  if (data.tidx < P1d) {
    const CeedInt node = data.tidx;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      d_v[ind + comp * STRIDES_COMP] += r_v[comp];
  }
}

//------------------------------------------------------------------------------
// 1D tensor contraction x
//------------------------------------------------------------------------------
// Stages U through shared memory ('slice'); barriers bracket the write/read
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX1d(BackendData& data, const CeedScalar *U,
                                   const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i)
    *V += B[i + data.tidx*P1d] * data.slice[i]; // Contract x direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 1D transpose tensor contraction x
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX1d(BackendData& data,
                                            const CeedScalar *U,
                                            const CeedScalar *B,
                                            CeedScalar *V) {
  data.slice[data.tidx] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < Q1d; ++i)
    *V += B[data.tidx + i*P1d] * data.slice[i]; // Contract x direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 1D interpolate to quadrature points
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp1d(BackendData& data,
                                const CeedScalar *__restrict__ r_U,
                                const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
}

//------------------------------------------------------------------------------
// 1D interpolate transpose
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose1d(BackendData& data,
                                         const CeedScalar *__restrict__ r_U,
                                         const CeedScalar *c_B,
                                         CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_V + comp);
}

//------------------------------------------------------------------------------
// 1D derivatives at quadrature points
//------------------------------------------------------------------------------
// c_B is unused in 1D (kept for signature parity with 2D/3D grad)
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad1d(BackendData& data,
                              const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    ContractX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
}

//------------------------------------------------------------------------------
// 1D derivatives transpose
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose1d(BackendData& data,
                                       const CeedScalar *__restrict__ r_U,
                                       const CeedScalar *c_B,
                                       const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; comp++)
    ContractTransposeX1d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_V + comp);
}
//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// NOTE(review): template parameter lists were stripped from this file in
// transit; they are reconstructed here from in-body usage -- confirm upstream.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset2d(BackendData& data, const CeedInt nnodes,
                                        const CeedInt elem,
                                        const CeedInt* indices,
                                        const CeedScalar* d_u,
                                        CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP,
          int STRIDES_ELEM>
inline __device__ void readDofsStrided2d(BackendData& data, const CeedInt elem,
                                         const CeedScalar* d_u,
                                         CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset2d(BackendData& data,
                                         const CeedInt nnodes,
                                         const CeedInt elem,
                                         const CeedInt* indices,
                                         const CeedScalar* r_v,
                                         CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = indices[node + elem * P1d*P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP,
          int STRIDES_ELEM>
inline __device__ void writeDofsStrided2d(BackendData& data,
                                          const CeedInt elem,
                                          const CeedScalar* r_v,
                                          CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d) {
    const CeedInt node = data.tidx + data.tidy*P1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      d_v[ind + comp * STRIDES_COMP] += r_v[comp];
  }
}

//------------------------------------------------------------------------------
// 2D tensor contraction x
//------------------------------------------------------------------------------
// Shared-memory slice is laid out Q1d x Q1d; barriers bracket write/read
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX2d(BackendData& data, const CeedScalar *U,
                                   const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i)
    *V += B[i + data.tidx*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D tensor contract y
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY2d(BackendData& data, const CeedScalar *U,
                                   const CeedScalar *B, CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  for (CeedInt i = 0; i < P1d; ++i)
    *V += B[i + data.tidy*P1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract y
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractYTranspose2d(BackendData& data,
                                            const CeedScalar *U,
                                            const CeedScalar *B,
                                            CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidy < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidy + i*P1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract x
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTranspose2d(BackendData& data,
                                            const CeedScalar *U,
                                            const CeedScalar *B,
                                            CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  *V = 0.0;
  if (data.tidx < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D transpose tensor contract and add x
//------------------------------------------------------------------------------
// Same as ContractXTranspose2d but accumulates into V instead of overwriting
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractXTransposeAdd2d(BackendData& data,
                                               const CeedScalar *U,
                                               const CeedScalar *B,
                                               CeedScalar *V) {
  data.slice[data.tidx+data.tidy*Q1d] = *U;
  __syncthreads();
  if (data.tidx < P1d)
    for (CeedInt i = 0; i < Q1d; ++i)
      *V += B[data.tidx + i*P1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction
  __syncthreads();
}

//------------------------------------------------------------------------------
// 2D interpolate to quadrature points
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp2d(BackendData& data,
                                const CeedScalar *__restrict__ r_U,
                                const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
  }
}

//------------------------------------------------------------------------------
// 2D interpolate transpose
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose2d(BackendData& data,
                                         const CeedScalar *__restrict__ r_U,
                                         const CeedScalar *c_B,
                                         CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
  }
}

//------------------------------------------------------------------------------
// 2D derivatives at quadrature points
//------------------------------------------------------------------------------
// r_V holds d/dx in slot [comp + 0*NCOMP] and d/dy in slot [comp + 1*NCOMP]
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad2d(BackendData& data,
                              const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_G, r_t);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp + 0*NCOMP);
    ContractX2d<NCOMP, P1d, Q1d>(data, r_U + comp, c_B, r_t);
    ContractY2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp + 1*NCOMP);
  }
}

//------------------------------------------------------------------------------
// 2D derivatives transpose
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose2d(BackendData& data,
                                       const CeedScalar *__restrict__ r_U,
                                       const CeedScalar *c_B,
                                       const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  CeedScalar r_t[1];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 0*NCOMP, c_B, r_t);
    ContractXTranspose2d<NCOMP, P1d, Q1d>(data, r_t, c_G, r_V + comp);
    ContractYTranspose2d<NCOMP, P1d, Q1d>(data, r_U + comp + 1*NCOMP, c_G, r_t);
    ContractXTransposeAdd2d<NCOMP, P1d, Q1d>(data, r_t, c_B, r_V + comp);
  }
}

//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Each thread owns a z-column: r_u is indexed [z + comp*P1d]
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset3d(BackendData& data, const CeedInt nnodes,
                                        const CeedInt elem,
                                        const CeedInt* indices,
                                        const CeedScalar* d_u,
                                        CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
    }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP,
          int STRIDES_ELEM>
inline __device__ void readDofsStrided3d(BackendData& data, const CeedInt elem,
                                         const CeedScalar* d_u,
                                         CeedScalar* r_u) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
    }
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
//------------------------------------------------------------------------------
// Reads one z-slice (at quadrature level q) of a collocated quadrature vector
template <int NCOMP, int COMPSTRIDE, int Q1d>
inline __device__ void readSliceQuadsOffset3d(BackendData& data,
                                              const CeedInt nquads,
                                              const CeedInt elem,
                                              const CeedInt q,
                                              const CeedInt* indices,
                                              const CeedScalar* d_u,
                                              CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
  const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];
  for (CeedInt comp = 0; comp < NCOMP; ++comp)
    r_u[comp] = d_u[ind + COMPSTRIDE * comp];
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, strided
//------------------------------------------------------------------------------
template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP,
          int STRIDES_ELEM>
inline __device__ void readSliceQuadsStrided3d(BackendData& data,
                                               const CeedInt elem,
                                               const CeedInt q,
                                               const CeedScalar* d_u,
                                               CeedScalar* r_u) {
  const CeedInt node = data.tidx + data.tidy*Q1d + q*Q1d*Q1d;
  const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt comp = 0; comp < NCOMP; ++comp)
    r_u[comp] = d_u[ind + comp * STRIDES_COMP];
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset3d(BackendData& data,
                                         const CeedInt nnodes,
                                         const CeedInt elem,
                                         const CeedInt* indices,
                                         const CeedScalar* r_v,
                                         CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
    }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP,
          int STRIDES_ELEM>
inline __device__ void writeDofsStrided3d(BackendData& data,
                                          const CeedInt elem,
                                          const CeedScalar* r_v,
                                          CeedScalar* d_v) {
  if (data.tidx < P1d && data.tidy < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.tidx + data.tidy*P1d + z*P1d*P1d;
      const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
    }
}

//------------------------------------------------------------------------------
// 3D tensor contract x
//------------------------------------------------------------------------------
// Caches this thread's row of B in registers, then contracts slice by slice
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractX3d(BackendData& data, const CeedScalar *U,
                                   const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidx*P1d];
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i)
      V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D tensor contract y
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractY3d(BackendData& data, const CeedScalar *U,
                                   const CeedScalar *B, CeedScalar *V) {
  CeedScalar r_B[P1d];
  for (CeedInt i = 0; i < P1d; ++i)
    r_B[i] = B[i + data.tidy*P1d];
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i)
      V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D tensor contract z
//------------------------------------------------------------------------------
// Purely register-local: each thread already owns the full z-column
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractZ3d(BackendData& data, const CeedScalar *U,
                                   const CeedScalar *B, CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    for (CeedInt i = 0; i < P1d; ++i)
      V[k] += B[i + k*P1d] * U[i]; // Contract z direction
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract z
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeZ3d(BackendData& data,
                                            const CeedScalar *U,
                                            const CeedScalar *B,
                                            CeedScalar *V) {
  for (CeedInt k = 0; k < Q1d; ++k) {
    V[k] = 0.0;
    if (k < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += B[k + i*P1d] * U[i]; // Contract z direction
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract y
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeY3d(BackendData& data,
                                            const CeedScalar *U,
                                            const CeedScalar *B,
                                            CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d];
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract add y
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddY3d(BackendData& data,
                                               const CeedScalar *U,
                                               const CeedScalar *B,
                                               CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidy + i*P1d];
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    if (data.tidy < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[data.tidx + i*Q1d]; // Contract y direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract x
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeX3d(BackendData& data,
                                            const CeedScalar *U,
                                            const CeedScalar *B,
                                            CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d];
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    V[k] = 0.0;
    if (data.tidx < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D transpose tensor contract add x
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void ContractTransposeAddX3d(BackendData& data,
                                               const CeedScalar *U,
                                               const CeedScalar *B,
                                               CeedScalar *V) {
  CeedScalar r_B[Q1d];
  for (CeedInt i = 0; i < Q1d; ++i)
    r_B[i] = B[data.tidx + i*P1d];
  for (CeedInt k = 0; k < P1d; ++k) {
    data.slice[data.tidx+data.tidy*Q1d] = U[k];
    __syncthreads();
    if (data.tidx < P1d)
      for (CeedInt i = 0; i < Q1d; ++i)
        V[k] += r_B[i] * data.slice[i + data.tidy*Q1d]; // Contract x direction
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D interpolate to quadrature points
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interp3d(BackendData& data,
                                const CeedScalar *__restrict__ r_U,
                                const CeedScalar *c_B,
                                CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d);
  }
}

//------------------------------------------------------------------------------
// 3D interpolate transpose
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void interpTranspose3d(BackendData& data,
                                         const CeedScalar *__restrict__ r_U,
                                         const CeedScalar *c_B,
                                         CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
  }
}

//------------------------------------------------------------------------------
// 3D derivatives at quadrature points
//------------------------------------------------------------------------------
// r_V holds d/dx, d/dy, d/dz in slots offset by 0, 1, 2 * NCOMP*Q1d
template <int NCOMP, int P1d, int Q1d>
inline __device__ void grad3d(BackendData& data,
                              const CeedScalar *__restrict__ r_U,
                              const CeedScalar *c_B, const CeedScalar *c_G,
                              CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_G, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 0*NCOMP*Q1d);
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*Q1d + 1*NCOMP*Q1d);
    ContractX3d<NCOMP, P1d, Q1d>(data, r_U + comp*P1d, c_B, r_t1);
    ContractY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractZ3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*Q1d + 2*NCOMP*Q1d);
  }
}

//------------------------------------------------------------------------------
// 3D derivatives transpose
//------------------------------------------------------------------------------
template <int NCOMP, int P1d, int Q1d>
inline __device__ void gradTranspose3d(BackendData& data,
                                       const CeedScalar *__restrict__ r_U,
                                       const CeedScalar *c_B,
                                       const CeedScalar *c_G,
                                       CeedScalar *__restrict__ r_V) {
  CeedScalar r_t1[Q1d];
  CeedScalar r_t2[Q1d];
  for (CeedInt comp = 0; comp < NCOMP; comp++) {
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 0*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeX3d<NCOMP, P1d, Q1d>(data, r_t2, c_G, r_V + comp*P1d);
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 1*NCOMP*Q1d, c_B, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_G, r_t2);
    ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
    ContractTransposeZ3d<NCOMP, P1d, Q1d>(data, r_U + comp*Q1d + 2*NCOMP*Q1d, c_G, r_t1);
    ContractTransposeY3d<NCOMP, P1d, Q1d>(data, r_t1, c_B, r_t2);
    ContractTransposeAddX3d<NCOMP, P1d, Q1d>(data, r_t2, c_B, r_V + comp*P1d);
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives computation
//------------------------------------------------------------------------------
// Uses the collocated gradient matrix c_G (Q1d x Q1d) at quadrature level q
template <int NCOMP, int Q1d>
inline __device__ void gradCollo3d(BackendData& data, const CeedInt q,
                                   const CeedScalar *__restrict__ r_U,
                                   const CeedScalar *c_G,
                                   CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    data.slice[data.tidx + data.tidy*Q1d] = r_U[q + comp*Q1d];
    __syncthreads();
    // X derivative
    r_V[comp+0*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[comp+0*NCOMP] += c_G[i + data.tidx*Q1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction (X derivative)
    // Y derivative
    r_V[comp+1*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[comp+1*NCOMP] += c_G[i + data.tidy*Q1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction (Y derivative)
    // Z derivative
    r_V[comp+2*NCOMP] = 0.0;
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives transpose
//------------------------------------------------------------------------------
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(BackendData& data, const CeedInt q,
                                            const CeedScalar *__restrict__ r_U,
                                            const CeedScalar *c_G,
                                            CeedScalar *__restrict__ r_V) {
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // X derivative
    data.slice[data.tidx + data.tidy*Q1d] = r_U[comp + 0*NCOMP];
    __syncthreads();
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[q+comp*Q1d] += c_G[data.tidx + i*Q1d] * data.slice[i + data.tidy*Q1d]; // Contract x direction (X derivative)
    __syncthreads();
    // Y derivative
    data.slice[data.tidx + data.tidy*Q1d] = r_U[comp + 1*NCOMP];
    __syncthreads();
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[q+comp*Q1d] += c_G[data.tidy + i*Q1d] * data.slice[data.tidx + i*Q1d]; // Contract y direction (Y derivative)
    __syncthreads();
    // Z derivative
    for (CeedInt i = 0; i < Q1d; ++i)
      r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
  }
}

//------------------------------------------------------------------------------
// 1D quadrature weights
//------------------------------------------------------------------------------
template <int Q1d>
inline __device__ void weight1d(BackendData& data, const CeedScalar *qweight1d,
                                CeedScalar *w) {
  *w = qweight1d[data.tidx];
}
//------------------------------------------------------------------------------
// 2D quadrature weights
//------------------------------------------------------------------------------
// NOTE(review): template parameter lists were stripped from this file;
// '<int Q1d>' below is reconstructed from usage — confirm against upstream.
template <int Q1d>
inline __device__ void weight2d(BackendData& data, const CeedScalar *qweight1d,
                                CeedScalar *w) {
  *w = qweight1d[data.tidx]*qweight1d[data.tidy];
}

//------------------------------------------------------------------------------
// 3D quadrature weights
//------------------------------------------------------------------------------
template <int Q1d>
inline __device__ void weight3d(BackendData& data, const CeedScalar *qweight1d,
                                CeedScalar *w) {
  // Partial weight for this thread's (x, y) point; one output per z layer
  const CeedScalar pw = qweight1d[data.tidx]*qweight1d[data.tidy];
  for (CeedInt z = 0; z < Q1d; ++z)
    w[z] = pw*qweight1d[z];
}

);

//------------------------------------------------------------------------------
// Build single operator kernel
//------------------------------------------------------------------------------
extern "C" int CeedCudaGenOperatorBuild(CeedOperator op) {
  using std::ostringstream;
  using std::string;
  int ierr;

  // Only generate and compile the fused kernel once per operator
  bool setupdone;
  ierr = CeedOperatorIsSetupDone(op, &setupdone); CeedChk(ierr);
  if (setupdone) return 0;

  // Gather operator, QFunction, and backend data handles
  Ceed ceed;
  ierr = CeedOperatorGetCeed(op, &ceed); CeedChk(ierr);
  CeedOperator_Cuda_gen *data;
  ierr = CeedOperatorGetData(op, (void**)&data); CeedChk(ierr);
  CeedQFunction qf;
  CeedQFunction_Cuda_gen *qf_data;
  ierr = CeedOperatorGetQFunction(op, &qf); CeedChk(ierr);
  ierr = CeedQFunctionGetData(qf, (void **)&qf_data); CeedChk(ierr);
  CeedInt Q, P1d, Q1d = 0, numelements, elemsize, numinputfields,
          numoutputfields, ncomp, dim = 0, lsize;
  ierr = CeedOperatorGetNumQuadraturePoints(op, &Q); CeedChk(ierr);
  ierr = CeedOperatorGetNumElements(op, &numelements); CeedChk(ierr);
  ierr = CeedQFunctionGetNumArgs(qf, &numinputfields, &numoutputfields);
  CeedChk(ierr);
  CeedOperatorField *opinputfields, *opoutputfields;
  ierr = CeedOperatorGetFields(op, &opinputfields, &opoutputfields);
  CeedChk(ierr);
  CeedQFunctionField *qfinputfields,
// NOTE(review): the remainder of this builder is corrupted — text between '<'
// and '>' was stripped (HTML-tag-like removal), so the `code <<` stream chains
// that emit the generated kernel are missing their variable operands (e.g.
// `code << " __shared__ double s_B_in_"<(data, B.in["<` and
// `"<1?"*Q1d":"")<<";\n"`), and newlines were collapsed so inline `//`
// comments now swallow following code. This text cannot compile as-is and
// cannot be reconstructed safely from what is visible; restore these lines
// from version control / the upstream libCEED repository. Kept byte-identical
// below. Visible logic, for orientation: fetch QFunction fields; prepend a
// software atomicAdd(double) when prop.major < 6 (hardware double atomicAdd
// requires SM60+); scan input then output bases to collect dim/Q1d and detect
// a collocated gradient (rejecting non-tensor bases); then emit per-field
// restriction/basis/QFunction code into `code`, compile the module, and fetch
// the "oper" kernel before marking setup done.
*qfoutputfields; ierr = CeedQFunctionGetFields(qf, &qfinputfields, &qfoutputfields); CeedChk(ierr); CeedEvalMode emode; CeedBasis basis; CeedBasis_Cuda_shared *basis_data; CeedElemRestriction Erestrict; CeedElemRestriction_Cuda_reg *restr_data; ostringstream code; string devFunctions(deviceFunctions); // Add atomicAdd function for old NVidia architectures struct cudaDeviceProp prop; Ceed_Cuda *ceed_data; ierr = CeedGetData(ceed, (void **)&ceed_data); CeedChk(ierr); ierr = cudaGetDeviceProperties(&prop, ceed_data->deviceId); if (prop.major<6){ code << atomicAdd; } code << devFunctions; string qFunction(qf_data->qFunctionSource); code << "\n#define CEED_QFUNCTION(name) inline __device__ int name\n"; code << "\n#define CeedPragmaSIMD\n"; // Find dim and Q1d bool collograd = false; for (CeedInt i = 0; i < numinputfields; i++) { ierr = CeedOperatorFieldGetBasis(opinputfields[i], &basis); CeedChk(ierr); if (basis != CEED_BASIS_COLLOCATED) { ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); // Check for collocated gradient if (basis_data->d_collograd1d) collograd = true; // Collect dim and Q1d ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); bool isTensor; ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr); if (isTensor) { ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChk(ierr); } else { return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); } } } // Check output bases for Q1d, dim as well // The only imput basis might be CEED_BASIS_COLLOCATED for (CeedInt i = 0; i < numoutputfields; i++) { ierr = CeedOperatorFieldGetBasis(opoutputfields[i], &basis); CeedChk(ierr); if (basis != CEED_BASIS_COLLOCATED) { ierr = CeedBasisGetData(basis, (void **)&basis_data); CeedChk(ierr); // Collect dim and Q1d ierr = CeedBasisGetDimension(basis, &dim); CeedChk(ierr); bool isTensor; ierr = CeedBasisIsTensor(basis, &isTensor); CeedChk(ierr); if (isTensor) { ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d);
// NOTE(review): corrupted continuation — stream operands stripped; restore from upstream.
CeedChk(ierr); } else { return CeedError(ceed, 1, "Backend does not implement operators with non-tensor basis"); } } } data->dim = dim; data->Q1d = Q1d; // Define CEED_Q_VLA if (dim != 3 || collograd) { code << "\n#define CEED_Q_VLA 1\n\n"; } else { code << "\n#define CEED_Q_VLA "<1?"*Q1d":"")<<";\n"; code << "\n // -- Input field constants and basis data --\n"; //Initialize constants, and matrices B and G for (CeedInt i = 0; i < numinputfields; i++) { code << " // ---- Input field "<B.in[i] = basis_data->d_interp1d; code << " __shared__ double s_B_in_"<(data, B.in["<B.in[i] = basis_data->d_interp1d; code << " __shared__ double s_B_in_"<(data, B.in["<d_collograd1d) { data->G.in[i] = basis_data->d_collograd1d; code << " __shared__ double s_G_in_"<(data, G.in["<G.in[i] = basis_data->d_grad1d; code << " __shared__ double s_G_in_"<(data, G.in["<B.out[i] = basis_data->d_interp1d; code << " __shared__ double s_B_out_"<(data, B.out["<B.out[i] = basis_data->d_interp1d; code << " __shared__ double s_B_out_"<(data, B.out["<d_collograd1d) { data->G.out[i] = basis_data->d_collograd1d; code << " __shared__ double s_G_out_"<(data, G.out["<G.out[i] = basis_data->d_grad1d; code << " __shared__ double s_G_out_"<(data, G.out["<d_collograd1d)) { code << " CeedScalar r_u"<indices.in[i] = restr_data->d_ind; code << " readDofsOffset"<(data, lsize_in_"<(data, elem, d_u"<d_collograd1d) { code << " CeedScalar* r_t"<(data, r_u"<d_collograd1d) { code << " CeedScalar r_t"<(data, r_u"<(data, r_u"<W = basis_data->d_qweight1d; code << " weight"<(data, W, r_t"<d_collograd1d) { //Accumulator for gradient slices code << " CeedScalar r_tt"<d_collograd1d) { code << "\n // Note: Collocated Gradient\n"; code << "#pragma unroll\n"; code << " for (CeedInt q=0; qindices.in[i] = restr_data->d_ind; code << " readSliceQuadsOffset"<<"3d(data, lsize_in_"<(data, elem, q, d_u"<(data, q, r_t"<qFunctionName); code << " "<d_collograd1d) { code << "1"; } else { code << "Q1d"; } code << ", in, out);\n"; if
// NOTE(review): corrupted continuation (output-field write-back, module compile, kernel fetch) — restore from upstream.
(basis_data->d_collograd1d) { code << "\n // Note: Collocated Gradient\n"; code << " // -- Output fields --\n"; for (CeedInt i = 0; i < numoutputfields; i++) { code << " // ---- Output field "<(data, q, r_qq"<(data, r_tt"<d_collograd1d) { code << " interpTranspose"<(data, r_tt"<(data, r_tt"<indices.out[i] = restr_data->d_ind; code << " writeDofsOffset"<(data, lsize_out_"<(data, elem, r_v"<module, 0); CeedChk(ierr); ierr = CeedGetKernelCuda(ceed, data->module, "oper", &data->op); CeedChk(ierr); ierr = CeedOperatorSetSetupDone(op); CeedChk(ierr); return 0; } //------------------------------------------------------------------------------