1*0d0321e0SJeremy L Thompson // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2*0d0321e0SJeremy L Thompson // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3*0d0321e0SJeremy L Thompson // All Rights reserved. See files LICENSE and NOTICE for details. 4*0d0321e0SJeremy L Thompson // 5*0d0321e0SJeremy L Thompson // This file is part of CEED, a collection of benchmarks, miniapps, software 6*0d0321e0SJeremy L Thompson // libraries and APIs for efficient high-order finite element and spectral 7*0d0321e0SJeremy L Thompson // element discretizations for exascale applications. For more information and 8*0d0321e0SJeremy L Thompson // source code availability see http://github.com/ceed. 9*0d0321e0SJeremy L Thompson // 10*0d0321e0SJeremy L Thompson // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11*0d0321e0SJeremy L Thompson // a collaborative effort of two U.S. Department of Energy organizations (Office 12*0d0321e0SJeremy L Thompson // of Science and the National Nuclear Security Administration) responsible for 13*0d0321e0SJeremy L Thompson // the planning and preparation of a capable exascale ecosystem, including 14*0d0321e0SJeremy L Thompson // software, applications, hardware, advanced system engineering and early 15*0d0321e0SJeremy L Thompson // testbed platforms, in support of the nation's exascale computing imperative. 16*0d0321e0SJeremy L Thompson 17*0d0321e0SJeremy L Thompson #include <ceed/ceed.h> 18*0d0321e0SJeremy L Thompson #include <ceed/backend.h> 19*0d0321e0SJeremy L Thompson #include <cuda.h> 20*0d0321e0SJeremy L Thompson #include <cuda_runtime.h> 21*0d0321e0SJeremy L Thompson #include "ceed-cuda-ref.h" 22*0d0321e0SJeremy L Thompson #include "../cuda/ceed-cuda-compile.h" 23*0d0321e0SJeremy L Thompson 24*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 25*0d0321e0SJeremy L Thompson // Tensor Basis Kernels 26*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 27*0d0321e0SJeremy L Thompson // *INDENT-OFF* 28*0d0321e0SJeremy L Thompson static const char *basiskernels = QUOTE( 29*0d0321e0SJeremy L Thompson 30*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 31*0d0321e0SJeremy L Thompson // Interp 32*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 33*0d0321e0SJeremy L Thompson extern "C" __global__ void interp(const CeedInt nelem, const int transpose, 34*0d0321e0SJeremy L Thompson const CeedScalar *__restrict__ interp1d, 35*0d0321e0SJeremy L Thompson const CeedScalar *__restrict__ u, 36*0d0321e0SJeremy L Thompson CeedScalar *__restrict__ v) { 37*0d0321e0SJeremy L Thompson const CeedInt i = threadIdx.x; 38*0d0321e0SJeremy L Thompson 39*0d0321e0SJeremy L Thompson __shared__ CeedScalar s_mem[BASIS_Q1D * BASIS_P1D + 2 * BASIS_BUF_LEN]; 40*0d0321e0SJeremy L Thompson CeedScalar *s_interp1d = s_mem; 41*0d0321e0SJeremy L Thompson CeedScalar *s_buf1 = s_mem + BASIS_Q1D * BASIS_P1D; 42*0d0321e0SJeremy L Thompson CeedScalar *s_buf2 = s_buf1 + BASIS_BUF_LEN; 43*0d0321e0SJeremy L Thompson for (CeedInt k = i; k < BASIS_Q1D * BASIS_P1D; k += blockDim.x) { 44*0d0321e0SJeremy L Thompson s_interp1d[k] = interp1d[k]; 45*0d0321e0SJeremy L Thompson } 46*0d0321e0SJeremy L Thompson 47*0d0321e0SJeremy L Thompson const CeedInt P = transpose ? BASIS_Q1D : BASIS_P1D; 48*0d0321e0SJeremy L Thompson const CeedInt Q = transpose ? BASIS_P1D : BASIS_Q1D; 49*0d0321e0SJeremy L Thompson const CeedInt stride0 = transpose ? 1 : BASIS_P1D; 50*0d0321e0SJeremy L Thompson const CeedInt stride1 = transpose ? BASIS_P1D : 1; 51*0d0321e0SJeremy L Thompson const CeedInt u_stride = transpose ? BASIS_NQPT : BASIS_ELEMSIZE; 52*0d0321e0SJeremy L Thompson const CeedInt v_stride = transpose ? BASIS_ELEMSIZE : BASIS_NQPT; 53*0d0321e0SJeremy L Thompson const CeedInt u_comp_stride = nelem * (transpose ? BASIS_NQPT : BASIS_ELEMSIZE); 54*0d0321e0SJeremy L Thompson const CeedInt v_comp_stride = nelem * (transpose ? BASIS_ELEMSIZE : BASIS_NQPT); 55*0d0321e0SJeremy L Thompson const CeedInt u_size = transpose ? BASIS_NQPT : BASIS_ELEMSIZE; 56*0d0321e0SJeremy L Thompson 57*0d0321e0SJeremy L Thompson // Apply basis element by element 58*0d0321e0SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < nelem; elem += gridDim.x) { 59*0d0321e0SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NCOMP; ++comp) { 60*0d0321e0SJeremy L Thompson const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; 61*0d0321e0SJeremy L Thompson CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; 62*0d0321e0SJeremy L Thompson for (CeedInt k = i; k < u_size; k += blockDim.x) { 63*0d0321e0SJeremy L Thompson s_buf1[k] = cur_u[k]; 64*0d0321e0SJeremy L Thompson } 65*0d0321e0SJeremy L Thompson CeedInt pre = u_size; 66*0d0321e0SJeremy L Thompson CeedInt post = 1; 67*0d0321e0SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 68*0d0321e0SJeremy L Thompson __syncthreads(); 69*0d0321e0SJeremy L Thompson // Update buffers used 70*0d0321e0SJeremy L Thompson pre /= P; 71*0d0321e0SJeremy L Thompson const CeedScalar *in = d % 2 ? s_buf2 : s_buf1; 72*0d0321e0SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buf1 : s_buf2); 73*0d0321e0SJeremy L Thompson 74*0d0321e0SJeremy L Thompson // Contract along middle index 75*0d0321e0SJeremy L Thompson const CeedInt writeLen = pre * post * Q; 76*0d0321e0SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 77*0d0321e0SJeremy L Thompson const CeedInt c = k % post; 78*0d0321e0SJeremy L Thompson const CeedInt j = (k / post) % Q; 79*0d0321e0SJeremy L Thompson const CeedInt a = k / (post * Q); 80*0d0321e0SJeremy L Thompson 81*0d0321e0SJeremy L Thompson CeedScalar vk = 0; 82*0d0321e0SJeremy L Thompson for (CeedInt b = 0; b < P; b++) 83*0d0321e0SJeremy L Thompson vk += s_interp1d[j*stride0 + b*stride1] * in[(a*P + b)*post + c]; 84*0d0321e0SJeremy L Thompson 85*0d0321e0SJeremy L Thompson out[k] = vk; 86*0d0321e0SJeremy L Thompson } 87*0d0321e0SJeremy L Thompson 88*0d0321e0SJeremy L Thompson post *= Q; 89*0d0321e0SJeremy L Thompson } 90*0d0321e0SJeremy L Thompson } 91*0d0321e0SJeremy L Thompson } 92*0d0321e0SJeremy L Thompson } 93*0d0321e0SJeremy L Thompson 94*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 95*0d0321e0SJeremy L Thompson // Grad 96*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 97*0d0321e0SJeremy L Thompson extern "C" __global__ void grad(const CeedInt nelem, const int transpose, 98*0d0321e0SJeremy L Thompson const CeedScalar *__restrict__ interp1d, 99*0d0321e0SJeremy L Thompson const CeedScalar *__restrict__ grad1d, 100*0d0321e0SJeremy L Thompson const CeedScalar *__restrict__ u, 101*0d0321e0SJeremy L Thompson CeedScalar *__restrict__ v) { 102*0d0321e0SJeremy L Thompson const CeedInt i = threadIdx.x; 103*0d0321e0SJeremy L Thompson 104*0d0321e0SJeremy L Thompson __shared__ CeedScalar s_mem[2 * (BASIS_Q1D * BASIS_P1D + BASIS_BUF_LEN)]; 105*0d0321e0SJeremy L Thompson CeedScalar *s_interp1d = s_mem; 106*0d0321e0SJeremy L Thompson CeedScalar *s_grad1d = s_interp1d + BASIS_Q1D * BASIS_P1D; 107*0d0321e0SJeremy L Thompson CeedScalar *s_buf1 = s_grad1d + BASIS_Q1D * BASIS_P1D; 108*0d0321e0SJeremy L Thompson CeedScalar *s_buf2 = s_buf1 + BASIS_BUF_LEN; 109*0d0321e0SJeremy L Thompson for (CeedInt k = i; k < BASIS_Q1D * BASIS_P1D; k += blockDim.x) { 110*0d0321e0SJeremy L Thompson s_interp1d[k] = interp1d[k]; 111*0d0321e0SJeremy L Thompson s_grad1d[k] = grad1d[k]; 112*0d0321e0SJeremy L Thompson } 113*0d0321e0SJeremy L Thompson 114*0d0321e0SJeremy L Thompson const CeedInt P = transpose ? BASIS_Q1D : BASIS_P1D; 115*0d0321e0SJeremy L Thompson const CeedInt Q = transpose ? BASIS_P1D : BASIS_Q1D; 116*0d0321e0SJeremy L Thompson const CeedInt stride0 = transpose ? 1 : BASIS_P1D; 117*0d0321e0SJeremy L Thompson const CeedInt stride1 = transpose ? BASIS_P1D : 1; 118*0d0321e0SJeremy L Thompson const CeedInt u_stride = transpose ? BASIS_NQPT : BASIS_ELEMSIZE; 119*0d0321e0SJeremy L Thompson const CeedInt v_stride = transpose ? BASIS_ELEMSIZE : BASIS_NQPT; 120*0d0321e0SJeremy L Thompson const CeedInt u_comp_stride = nelem * (transpose ? BASIS_NQPT : BASIS_ELEMSIZE); 121*0d0321e0SJeremy L Thompson const CeedInt v_comp_stride = nelem * (transpose ? BASIS_ELEMSIZE : BASIS_NQPT); 122*0d0321e0SJeremy L Thompson const CeedInt u_dim_stride = transpose ? nelem * BASIS_NQPT * BASIS_NCOMP : 0; 123*0d0321e0SJeremy L Thompson const CeedInt v_dim_stride = transpose ? 0 : nelem * BASIS_NQPT * BASIS_NCOMP; 124*0d0321e0SJeremy L Thompson 125*0d0321e0SJeremy L Thompson // Apply basis element by element 126*0d0321e0SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < nelem; elem += gridDim.x) { 127*0d0321e0SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NCOMP; ++comp) { 128*0d0321e0SJeremy L Thompson 129*0d0321e0SJeremy L Thompson // dim*dim contractions for grad 130*0d0321e0SJeremy L Thompson for (CeedInt dim1 = 0; dim1 < BASIS_DIM; dim1++) { 131*0d0321e0SJeremy L Thompson CeedInt pre = transpose ? BASIS_NQPT : BASIS_ELEMSIZE; 132*0d0321e0SJeremy L Thompson CeedInt post = 1; 133*0d0321e0SJeremy L Thompson const CeedScalar *cur_u = u + elem * u_stride + dim1 * u_dim_stride + 134*0d0321e0SJeremy L Thompson comp * u_comp_stride; 135*0d0321e0SJeremy L Thompson CeedScalar *cur_v = v + elem * v_stride + dim1 * v_dim_stride + comp * 136*0d0321e0SJeremy L Thompson v_comp_stride; 137*0d0321e0SJeremy L Thompson for (CeedInt dim2 = 0; dim2 < BASIS_DIM; dim2++) { 138*0d0321e0SJeremy L Thompson __syncthreads(); 139*0d0321e0SJeremy L Thompson // Update buffers used 140*0d0321e0SJeremy L Thompson pre /= P; 141*0d0321e0SJeremy L Thompson const CeedScalar *op = dim1 == dim2 ? s_grad1d : s_interp1d; 142*0d0321e0SJeremy L Thompson const CeedScalar *in = dim2 == 0 ? cur_u : (dim2 % 2 ? s_buf2 : s_buf1); 143*0d0321e0SJeremy L Thompson CeedScalar *out = dim2 == BASIS_DIM - 1 ? cur_v : (dim2 % 2 ? s_buf1 : s_buf2); 144*0d0321e0SJeremy L Thompson 145*0d0321e0SJeremy L Thompson // Contract along middle index 146*0d0321e0SJeremy L Thompson const CeedInt writeLen = pre * post * Q; 147*0d0321e0SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 148*0d0321e0SJeremy L Thompson const CeedInt c = k % post; 149*0d0321e0SJeremy L Thompson const CeedInt j = (k / post) % Q; 150*0d0321e0SJeremy L Thompson const CeedInt a = k / (post * Q); 151*0d0321e0SJeremy L Thompson CeedScalar vk = 0; 152*0d0321e0SJeremy L Thompson for (CeedInt b = 0; b < P; b++) 153*0d0321e0SJeremy L Thompson vk += op[j * stride0 + b * stride1] * in[(a * P + b) * post + c]; 154*0d0321e0SJeremy L Thompson 155*0d0321e0SJeremy L Thompson if (transpose && dim2 == BASIS_DIM - 1) 156*0d0321e0SJeremy L Thompson out[k] += vk; 157*0d0321e0SJeremy L Thompson else 158*0d0321e0SJeremy L Thompson out[k] = vk; 159*0d0321e0SJeremy L Thompson } 160*0d0321e0SJeremy L Thompson 161*0d0321e0SJeremy L Thompson post *= Q; 162*0d0321e0SJeremy L Thompson } 163*0d0321e0SJeremy L Thompson } 164*0d0321e0SJeremy L Thompson } 165*0d0321e0SJeremy L Thompson } 166*0d0321e0SJeremy L Thompson } 167*0d0321e0SJeremy L Thompson 168*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 169*0d0321e0SJeremy L Thompson // 1D quadrature weights 170*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 171*0d0321e0SJeremy L Thompson __device__ void weight1d(const CeedInt nelem, const CeedScalar *qweight1d, 172*0d0321e0SJeremy L Thompson CeedScalar *w) { 173*0d0321e0SJeremy L Thompson const int i = threadIdx.x; 174*0d0321e0SJeremy L Thompson if (i < BASIS_Q1D) { 175*0d0321e0SJeremy L Thompson const size_t elem = blockIdx.x; 176*0d0321e0SJeremy L Thompson if (elem < nelem) 177*0d0321e0SJeremy L Thompson w[elem*BASIS_Q1D + i] = qweight1d[i]; 178*0d0321e0SJeremy L Thompson } 179*0d0321e0SJeremy L Thompson } 180*0d0321e0SJeremy L Thompson 181*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 182*0d0321e0SJeremy L Thompson // 2D quadrature weights 183*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 184*0d0321e0SJeremy L Thompson __device__ void weight2d(const CeedInt nelem, const CeedScalar *qweight1d, 185*0d0321e0SJeremy L Thompson CeedScalar *w) { 186*0d0321e0SJeremy L Thompson 187*0d0321e0SJeremy L Thompson const int i = threadIdx.x; 188*0d0321e0SJeremy L Thompson const int j = threadIdx.y; 189*0d0321e0SJeremy L Thompson if (i < BASIS_Q1D && j < BASIS_Q1D) { 190*0d0321e0SJeremy L Thompson const size_t elem = blockIdx.x; 191*0d0321e0SJeremy L Thompson if (elem < nelem) { 192*0d0321e0SJeremy L Thompson const size_t ind = (elem * BASIS_Q1D + j) * BASIS_Q1D + i; 193*0d0321e0SJeremy L Thompson w[ind] = qweight1d[i] * qweight1d[j]; 194*0d0321e0SJeremy L Thompson } 195*0d0321e0SJeremy L Thompson } 196*0d0321e0SJeremy L Thompson } 197*0d0321e0SJeremy L Thompson 198*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 199*0d0321e0SJeremy L Thompson // 3D quadrature weights 200*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 201*0d0321e0SJeremy L Thompson __device__ void weight3d(const CeedInt nelem, const CeedScalar *qweight1d, 202*0d0321e0SJeremy L Thompson CeedScalar *w) { 203*0d0321e0SJeremy L Thompson const int i = threadIdx.x; 204*0d0321e0SJeremy L Thompson const int j = threadIdx.y; 205*0d0321e0SJeremy L Thompson if (i < BASIS_Q1D && j < BASIS_Q1D) { 206*0d0321e0SJeremy L Thompson const size_t elem = blockIdx.x; 207*0d0321e0SJeremy L Thompson if (elem < nelem) { 208*0d0321e0SJeremy L Thompson for (int k=0; k<BASIS_Q1D; k++) { 209*0d0321e0SJeremy L Thompson const size_t ind = ((elem * BASIS_Q1D + k) * BASIS_Q1D + j) * BASIS_Q1D + i; 210*0d0321e0SJeremy L Thompson w[ind] = qweight1d[i] * qweight1d[j] * qweight1d[k]; 211*0d0321e0SJeremy L Thompson } 212*0d0321e0SJeremy L Thompson } 213*0d0321e0SJeremy L Thompson } 214*0d0321e0SJeremy L Thompson } 215*0d0321e0SJeremy L Thompson 216*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 217*0d0321e0SJeremy L Thompson // Quadrature weights 218*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 219*0d0321e0SJeremy L Thompson extern "C" __global__ void weight(const CeedInt nelem, 220*0d0321e0SJeremy L Thompson const CeedScalar *__restrict__ qweight1d, 221*0d0321e0SJeremy L Thompson CeedScalar *__restrict__ v) { 222*0d0321e0SJeremy L Thompson if (BASIS_DIM==1) 223*0d0321e0SJeremy L Thompson weight1d(nelem, qweight1d, v); 224*0d0321e0SJeremy L Thompson else if (BASIS_DIM==2) 225*0d0321e0SJeremy L Thompson weight2d(nelem, qweight1d, v); 226*0d0321e0SJeremy L Thompson else if (BASIS_DIM==3) 227*0d0321e0SJeremy L Thompson weight3d(nelem, qweight1d, v); 228*0d0321e0SJeremy L Thompson } 229*0d0321e0SJeremy L Thompson 230*0d0321e0SJeremy L Thompson ); 231*0d0321e0SJeremy L Thompson 232*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 233*0d0321e0SJeremy L Thompson // Non-Tensor Basis Kernels 234*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 235*0d0321e0SJeremy L Thompson static const char *kernelsNonTensorRef = QUOTE( 236*0d0321e0SJeremy L Thompson 237*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 238*0d0321e0SJeremy L Thompson // Interp 239*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 240*0d0321e0SJeremy L Thompson extern "C" __global__ void interp(const CeedInt nelem, const int transpose, 241*0d0321e0SJeremy L Thompson const CeedScalar *d_B, 242*0d0321e0SJeremy L Thompson const CeedScalar *__restrict__ d_U, 243*0d0321e0SJeremy L Thompson CeedScalar *__restrict__ d_V) { 244*0d0321e0SJeremy L Thompson const int tid = threadIdx.x; 245*0d0321e0SJeremy L Thompson 246*0d0321e0SJeremy L Thompson const CeedScalar *U; 247*0d0321e0SJeremy L Thompson CeedScalar V; 248*0d0321e0SJeremy L Thompson //TODO load B in shared memory if blockDim.z > 1? 249*0d0321e0SJeremy L Thompson 250*0d0321e0SJeremy L Thompson for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; 251*0d0321e0SJeremy L Thompson elem += gridDim.x*blockDim.z) { 252*0d0321e0SJeremy L Thompson for (int comp = 0; comp < BASIS_NCOMP; comp++) { 253*0d0321e0SJeremy L Thompson if (!transpose) { // run with Q threads 254*0d0321e0SJeremy L Thompson U = d_U + elem*P + comp*nelem*P; 255*0d0321e0SJeremy L Thompson V = 0.0; 256*0d0321e0SJeremy L Thompson for (int i = 0; i < P; ++i) 257*0d0321e0SJeremy L Thompson V += d_B[i + tid*P]*U[i]; 258*0d0321e0SJeremy L Thompson 259*0d0321e0SJeremy L Thompson d_V[elem*Q + comp*nelem*Q + tid] = V; 260*0d0321e0SJeremy L Thompson } else { // run with P threads 261*0d0321e0SJeremy L Thompson U = d_U + elem*Q + comp*nelem*Q; 262*0d0321e0SJeremy L Thompson V = 0.0; 263*0d0321e0SJeremy L Thompson for (int i = 0; i < Q; ++i) 264*0d0321e0SJeremy L Thompson V += d_B[tid + i*P]*U[i]; 265*0d0321e0SJeremy L Thompson 266*0d0321e0SJeremy L Thompson d_V[elem*P + comp*nelem*P + tid] = V; 267*0d0321e0SJeremy L Thompson } 268*0d0321e0SJeremy L Thompson } 269*0d0321e0SJeremy L Thompson } 270*0d0321e0SJeremy L Thompson } 271*0d0321e0SJeremy L Thompson 272*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 273*0d0321e0SJeremy L Thompson // Grad 274*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 275*0d0321e0SJeremy L Thompson extern "C" __global__ void grad(const CeedInt nelem, const int transpose, 276*0d0321e0SJeremy L Thompson const CeedScalar *d_G, 277*0d0321e0SJeremy L Thompson const CeedScalar *__restrict__ d_U, 278*0d0321e0SJeremy L Thompson CeedScalar *__restrict__ d_V) { 279*0d0321e0SJeremy L Thompson const int tid = threadIdx.x; 280*0d0321e0SJeremy L Thompson 281*0d0321e0SJeremy L Thompson const CeedScalar *U; 282*0d0321e0SJeremy L Thompson //TODO load G in shared memory if blockDim.z > 1? 283*0d0321e0SJeremy L Thompson 284*0d0321e0SJeremy L Thompson for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; 285*0d0321e0SJeremy L Thompson elem += gridDim.x*blockDim.z) { 286*0d0321e0SJeremy L Thompson for (int comp=0; comp<BASIS_NCOMP; comp++) { 287*0d0321e0SJeremy L Thompson if (!transpose) { // run with Q threads 288*0d0321e0SJeremy L Thompson CeedScalar V[BASIS_DIM]; 289*0d0321e0SJeremy L Thompson U = d_U + elem*P + comp*nelem*P; 290*0d0321e0SJeremy L Thompson for (int dim = 0; dim < BASIS_DIM; dim++) 291*0d0321e0SJeremy L Thompson V[dim] = 0.0; 292*0d0321e0SJeremy L Thompson 293*0d0321e0SJeremy L Thompson for (int i = 0; i < P; ++i) { 294*0d0321e0SJeremy L Thompson const CeedScalar val = U[i]; 295*0d0321e0SJeremy L Thompson for(int dim = 0; dim < BASIS_DIM; dim++) 296*0d0321e0SJeremy L Thompson V[dim] += d_G[i + tid*P + dim*P*Q]*val; 297*0d0321e0SJeremy L Thompson } 298*0d0321e0SJeremy L Thompson for (int dim = 0; dim < BASIS_DIM; dim++) { 299*0d0321e0SJeremy L Thompson d_V[elem*Q + comp*nelem*Q + dim*BASIS_NCOMP*nelem*Q + tid] = V[dim]; 300*0d0321e0SJeremy L Thompson } 301*0d0321e0SJeremy L Thompson } else { // run with P threads 302*0d0321e0SJeremy L Thompson CeedScalar V = 0.0; 303*0d0321e0SJeremy L Thompson for (int dim = 0; dim < BASIS_DIM; dim++) { 304*0d0321e0SJeremy L Thompson U = d_U + elem*Q + comp*nelem*Q +dim*BASIS_NCOMP*nelem*Q; 305*0d0321e0SJeremy L Thompson for (int i = 0; i < Q; ++i) 306*0d0321e0SJeremy L Thompson V += d_G[tid + i*P + dim*P*Q]*U[i]; 307*0d0321e0SJeremy L Thompson } 308*0d0321e0SJeremy L Thompson d_V[elem*P + comp*nelem*P + tid] = V; 309*0d0321e0SJeremy L Thompson } 310*0d0321e0SJeremy L Thompson } 311*0d0321e0SJeremy L Thompson } 312*0d0321e0SJeremy L Thompson } 313*0d0321e0SJeremy L Thompson 314*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 315*0d0321e0SJeremy L Thompson // Weight 316*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 317*0d0321e0SJeremy L Thompson extern "C" __global__ void weight(const CeedInt nelem, 318*0d0321e0SJeremy L Thompson const CeedScalar *__restrict__ qweight, 319*0d0321e0SJeremy L Thompson CeedScalar *__restrict__ d_V) { 320*0d0321e0SJeremy L Thompson const int tid = threadIdx.x; 321*0d0321e0SJeremy L Thompson //TODO load qweight in shared memory if blockDim.z > 1? 322*0d0321e0SJeremy L Thompson for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; 323*0d0321e0SJeremy L Thompson elem += gridDim.x*blockDim.z) { 324*0d0321e0SJeremy L Thompson d_V[elem*Q + tid] = qweight[tid]; 325*0d0321e0SJeremy L Thompson } 326*0d0321e0SJeremy L Thompson } 327*0d0321e0SJeremy L Thompson 328*0d0321e0SJeremy L Thompson ); 329*0d0321e0SJeremy L Thompson // *INDENT-ON* 330*0d0321e0SJeremy L Thompson 331*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 332*0d0321e0SJeremy L Thompson // Basis apply - tensor 333*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 334*0d0321e0SJeremy L Thompson int CeedBasisApply_Cuda(CeedBasis basis, const CeedInt nelem, 335*0d0321e0SJeremy L Thompson CeedTransposeMode tmode, 336*0d0321e0SJeremy L Thompson CeedEvalMode emode, CeedVector u, CeedVector v) { 337*0d0321e0SJeremy L Thompson int ierr; 338*0d0321e0SJeremy L Thompson Ceed ceed; 339*0d0321e0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 340*0d0321e0SJeremy L Thompson Ceed_Cuda *ceed_Cuda; 341*0d0321e0SJeremy L Thompson ierr = CeedGetData(ceed, &ceed_Cuda); CeedChkBackend(ierr); 342*0d0321e0SJeremy L Thompson CeedBasis_Cuda *data; 343*0d0321e0SJeremy L Thompson ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 344*0d0321e0SJeremy L Thompson const CeedInt transpose = tmode == CEED_TRANSPOSE; 345*0d0321e0SJeremy L Thompson const int maxblocksize = 32; 346*0d0321e0SJeremy L Thompson 347*0d0321e0SJeremy L Thompson // Read vectors 348*0d0321e0SJeremy L Thompson const CeedScalar *d_u; 349*0d0321e0SJeremy L Thompson CeedScalar *d_v; 350*0d0321e0SJeremy L Thompson if (emode != CEED_EVAL_WEIGHT) { 351*0d0321e0SJeremy L Thompson ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); 352*0d0321e0SJeremy L Thompson } 353*0d0321e0SJeremy L Thompson ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); 354*0d0321e0SJeremy L Thompson 355*0d0321e0SJeremy L Thompson // Clear v for transpose operation 356*0d0321e0SJeremy L Thompson if (tmode == CEED_TRANSPOSE) { 357*0d0321e0SJeremy L Thompson CeedInt length; 358*0d0321e0SJeremy L Thompson ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr); 359*0d0321e0SJeremy L Thompson ierr = cudaMemset(d_v, 0, length * sizeof(CeedScalar)); 360*0d0321e0SJeremy L Thompson CeedChk_Cu(ceed,ierr); 361*0d0321e0SJeremy L Thompson } 362*0d0321e0SJeremy L Thompson CeedInt Q1d, dim; 363*0d0321e0SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr); 364*0d0321e0SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 365*0d0321e0SJeremy L Thompson 366*0d0321e0SJeremy L Thompson // Basis action 367*0d0321e0SJeremy L Thompson switch (emode) { 368*0d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: { 369*0d0321e0SJeremy L Thompson void *interpargs[] = {(void *) &nelem, (void *) &transpose, 370*0d0321e0SJeremy L Thompson &data->d_interp1d, &d_u, &d_v 371*0d0321e0SJeremy L Thompson }; 372*0d0321e0SJeremy L Thompson CeedInt blocksize = CeedIntPow(Q1d, dim); 373*0d0321e0SJeremy L Thompson blocksize = blocksize > maxblocksize ? maxblocksize : blocksize; 374*0d0321e0SJeremy L Thompson 375*0d0321e0SJeremy L Thompson ierr = CeedRunKernelCuda(ceed, data->interp, nelem, blocksize, interpargs); 376*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 377*0d0321e0SJeremy L Thompson } break; 378*0d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: { 379*0d0321e0SJeremy L Thompson void *gradargs[] = {(void *) &nelem, (void *) &transpose, &data->d_interp1d, 380*0d0321e0SJeremy L Thompson &data->d_grad1d, &d_u, &d_v 381*0d0321e0SJeremy L Thompson }; 382*0d0321e0SJeremy L Thompson CeedInt blocksize = maxblocksize; 383*0d0321e0SJeremy L Thompson 384*0d0321e0SJeremy L Thompson ierr = CeedRunKernelCuda(ceed, data->grad, nelem, blocksize, gradargs); 385*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 386*0d0321e0SJeremy L Thompson } break; 387*0d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: { 388*0d0321e0SJeremy L Thompson void *weightargs[] = {(void *) &nelem, (void *) &data->d_qweight1d, &d_v}; 389*0d0321e0SJeremy L Thompson const int gridsize = nelem; 390*0d0321e0SJeremy L Thompson ierr = CeedRunKernelDimCuda(ceed, data->weight, gridsize, 391*0d0321e0SJeremy L Thompson Q1d, dim >= 2 ? Q1d : 1, 1, 392*0d0321e0SJeremy L Thompson weightargs); CeedChkBackend(ierr); 393*0d0321e0SJeremy L Thompson } break; 394*0d0321e0SJeremy L Thompson // LCOV_EXCL_START 395*0d0321e0SJeremy L Thompson // Evaluate the divergence to/from the quadrature points 396*0d0321e0SJeremy L Thompson case CEED_EVAL_DIV: 397*0d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 398*0d0321e0SJeremy L Thompson // Evaluate the curl to/from the quadrature points 399*0d0321e0SJeremy L Thompson case CEED_EVAL_CURL: 400*0d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 401*0d0321e0SJeremy L Thompson // Take no action, BasisApply should not have been called 402*0d0321e0SJeremy L Thompson case CEED_EVAL_NONE: 403*0d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 404*0d0321e0SJeremy L Thompson "CEED_EVAL_NONE does not make sense in this context"); 405*0d0321e0SJeremy L Thompson // LCOV_EXCL_STOP 406*0d0321e0SJeremy L Thompson } 407*0d0321e0SJeremy L Thompson 408*0d0321e0SJeremy L Thompson // Restore vectors 409*0d0321e0SJeremy L Thompson if (emode != CEED_EVAL_WEIGHT) { 410*0d0321e0SJeremy L Thompson ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); 411*0d0321e0SJeremy L Thompson } 412*0d0321e0SJeremy L Thompson ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); 413*0d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 414*0d0321e0SJeremy L Thompson } 415*0d0321e0SJeremy L Thompson 416*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 417*0d0321e0SJeremy L Thompson // Basis apply - non-tensor 418*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 419*0d0321e0SJeremy L Thompson int CeedBasisApplyNonTensor_Cuda(CeedBasis basis, const CeedInt nelem, 420*0d0321e0SJeremy L Thompson CeedTransposeMode tmode, CeedEvalMode emode, 421*0d0321e0SJeremy L Thompson CeedVector u, CeedVector v) { 422*0d0321e0SJeremy L Thompson int ierr; 423*0d0321e0SJeremy L Thompson Ceed ceed; 424*0d0321e0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 425*0d0321e0SJeremy L Thompson Ceed_Cuda *ceed_Cuda; 426*0d0321e0SJeremy L Thompson ierr = CeedGetData(ceed, &ceed_Cuda); CeedChkBackend(ierr); 427*0d0321e0SJeremy L Thompson CeedBasisNonTensor_Cuda *data; 428*0d0321e0SJeremy L Thompson ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 429*0d0321e0SJeremy L Thompson CeedInt nnodes, nqpt; 430*0d0321e0SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints(basis, &nqpt); CeedChkBackend(ierr); 431*0d0321e0SJeremy L Thompson ierr = CeedBasisGetNumNodes(basis, &nnodes); CeedChkBackend(ierr); 432*0d0321e0SJeremy L Thompson const CeedInt transpose = tmode == CEED_TRANSPOSE; 433*0d0321e0SJeremy L Thompson int elemsPerBlock = 1; 434*0d0321e0SJeremy L Thompson int grid = nelem/elemsPerBlock+((nelem/elemsPerBlock*elemsPerBlock<nelem)?1:0); 435*0d0321e0SJeremy L Thompson 436*0d0321e0SJeremy L Thompson // Read vectors 437*0d0321e0SJeremy L Thompson const CeedScalar *d_u; 438*0d0321e0SJeremy L Thompson CeedScalar *d_v; 439*0d0321e0SJeremy L Thompson if (emode != CEED_EVAL_WEIGHT) { 440*0d0321e0SJeremy L Thompson ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); 441*0d0321e0SJeremy L Thompson } 442*0d0321e0SJeremy L Thompson ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); 443*0d0321e0SJeremy L Thompson 444*0d0321e0SJeremy L Thompson // Clear v for transpose operation 445*0d0321e0SJeremy L Thompson if (tmode == CEED_TRANSPOSE) { 446*0d0321e0SJeremy L Thompson CeedInt length; 447*0d0321e0SJeremy L Thompson ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr); 448*0d0321e0SJeremy L Thompson ierr = cudaMemset(d_v, 0, length * sizeof(CeedScalar)); 449*0d0321e0SJeremy L Thompson CeedChk_Cu(ceed, ierr); 450*0d0321e0SJeremy L Thompson } 451*0d0321e0SJeremy L Thompson 452*0d0321e0SJeremy L Thompson // Apply basis operation 453*0d0321e0SJeremy L Thompson switch (emode) { 454*0d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: { 455*0d0321e0SJeremy L Thompson void *interpargs[] = {(void *) &nelem, (void *) &transpose, 456*0d0321e0SJeremy L Thompson &data->d_interp, &d_u, &d_v 457*0d0321e0SJeremy L Thompson }; 458*0d0321e0SJeremy L Thompson if (!transpose) { 459*0d0321e0SJeremy L Thompson ierr = CeedRunKernelDimCuda(ceed, data->interp, grid, nqpt, 1, 460*0d0321e0SJeremy L Thompson elemsPerBlock, interpargs); CeedChkBackend(ierr); 461*0d0321e0SJeremy L Thompson } else { 462*0d0321e0SJeremy L Thompson ierr = CeedRunKernelDimCuda(ceed, data->interp, grid, nnodes, 1, 463*0d0321e0SJeremy L Thompson elemsPerBlock, interpargs); CeedChkBackend(ierr); 464*0d0321e0SJeremy L Thompson } 465*0d0321e0SJeremy L Thompson } break; 466*0d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: { 467*0d0321e0SJeremy L Thompson void *gradargs[] = {(void *) &nelem, (void *) &transpose, &data->d_grad, 468*0d0321e0SJeremy L Thompson &d_u, &d_v 469*0d0321e0SJeremy L Thompson }; 470*0d0321e0SJeremy L Thompson if (!transpose) { 471*0d0321e0SJeremy L Thompson ierr = CeedRunKernelDimCuda(ceed, data->grad, grid, nqpt, 1, 472*0d0321e0SJeremy L Thompson elemsPerBlock, gradargs); CeedChkBackend(ierr); 473*0d0321e0SJeremy L Thompson } else { 474*0d0321e0SJeremy L Thompson ierr = CeedRunKernelDimCuda(ceed, data->grad, grid, nnodes, 1, 475*0d0321e0SJeremy L Thompson elemsPerBlock, gradargs); CeedChkBackend(ierr); 476*0d0321e0SJeremy L Thompson } 477*0d0321e0SJeremy L Thompson } break; 478*0d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: { 479*0d0321e0SJeremy L Thompson void *weightargs[] = {(void *) &nelem, (void *) &data->d_qweight, &d_v}; 480*0d0321e0SJeremy L Thompson ierr = CeedRunKernelDimCuda(ceed, data->weight, grid, nqpt, 1, 481*0d0321e0SJeremy L Thompson elemsPerBlock, weightargs); CeedChkBackend(ierr); 482*0d0321e0SJeremy L Thompson } break; 483*0d0321e0SJeremy L Thompson // LCOV_EXCL_START 484*0d0321e0SJeremy L Thompson // Evaluate the divergence to/from the quadrature points 485*0d0321e0SJeremy L Thompson case CEED_EVAL_DIV: 486*0d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 487*0d0321e0SJeremy L Thompson // Evaluate the curl to/from the quadrature points 488*0d0321e0SJeremy L Thompson case CEED_EVAL_CURL: 489*0d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 490*0d0321e0SJeremy L Thompson // Take no action, BasisApply should not have been called 491*0d0321e0SJeremy L Thompson case CEED_EVAL_NONE: 492*0d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 493*0d0321e0SJeremy L Thompson "CEED_EVAL_NONE does not make sense in this context"); 494*0d0321e0SJeremy L Thompson // LCOV_EXCL_STOP 495*0d0321e0SJeremy L Thompson } 496*0d0321e0SJeremy L Thompson 497*0d0321e0SJeremy L Thompson // Restore vectors 498*0d0321e0SJeremy L Thompson if (emode != CEED_EVAL_WEIGHT) { 499*0d0321e0SJeremy L Thompson ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); 500*0d0321e0SJeremy L Thompson } 501*0d0321e0SJeremy L Thompson ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); 502*0d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 503*0d0321e0SJeremy L Thompson } 504*0d0321e0SJeremy L Thompson 505*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 506*0d0321e0SJeremy L Thompson // Destroy tensor basis 507*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 508*0d0321e0SJeremy L Thompson static int CeedBasisDestroy_Cuda(CeedBasis basis) { 509*0d0321e0SJeremy L Thompson int ierr; 510*0d0321e0SJeremy L Thompson Ceed ceed; 511*0d0321e0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 512*0d0321e0SJeremy L Thompson 513*0d0321e0SJeremy L Thompson CeedBasis_Cuda *data; 514*0d0321e0SJeremy L Thompson ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 515*0d0321e0SJeremy L Thompson 516*0d0321e0SJeremy L Thompson CeedChk_Cu(ceed, cuModuleUnload(data->module)); 517*0d0321e0SJeremy L Thompson 518*0d0321e0SJeremy L Thompson ierr = cudaFree(data->d_qweight1d); CeedChk_Cu(ceed,ierr); 519*0d0321e0SJeremy L Thompson ierr = cudaFree(data->d_interp1d); CeedChk_Cu(ceed,ierr); 520*0d0321e0SJeremy L Thompson ierr = cudaFree(data->d_grad1d); CeedChk_Cu(ceed,ierr); 521*0d0321e0SJeremy L Thompson 522*0d0321e0SJeremy L Thompson ierr = CeedFree(&data); CeedChkBackend(ierr); 523*0d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 524*0d0321e0SJeremy L Thompson } 525*0d0321e0SJeremy L Thompson 526*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 527*0d0321e0SJeremy L Thompson // Destroy non-tensor basis 528*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 529*0d0321e0SJeremy L Thompson static int CeedBasisDestroyNonTensor_Cuda(CeedBasis basis) { 530*0d0321e0SJeremy L Thompson int ierr; 531*0d0321e0SJeremy L Thompson Ceed ceed; 532*0d0321e0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 533*0d0321e0SJeremy L Thompson 534*0d0321e0SJeremy L Thompson CeedBasisNonTensor_Cuda *data; 535*0d0321e0SJeremy L Thompson ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 536*0d0321e0SJeremy L Thompson 537*0d0321e0SJeremy L Thompson CeedChk_Cu(ceed, cuModuleUnload(data->module)); 538*0d0321e0SJeremy L Thompson 539*0d0321e0SJeremy L Thompson ierr = cudaFree(data->d_qweight); CeedChk_Cu(ceed, ierr); 540*0d0321e0SJeremy L Thompson ierr = cudaFree(data->d_interp); CeedChk_Cu(ceed, ierr); 541*0d0321e0SJeremy L Thompson ierr = cudaFree(data->d_grad); CeedChk_Cu(ceed, ierr); 542*0d0321e0SJeremy L Thompson 543*0d0321e0SJeremy L Thompson ierr = CeedFree(&data); CeedChkBackend(ierr); 544*0d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 545*0d0321e0SJeremy L Thompson } 546*0d0321e0SJeremy L Thompson 547*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 548*0d0321e0SJeremy L Thompson // Create tensor 549*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 550*0d0321e0SJeremy L Thompson int CeedBasisCreateTensorH1_Cuda(CeedInt dim, CeedInt P1d, CeedInt Q1d, 551*0d0321e0SJeremy L Thompson const CeedScalar *interp1d, 552*0d0321e0SJeremy L Thompson const CeedScalar *grad1d, 553*0d0321e0SJeremy L Thompson const CeedScalar *qref1d, 554*0d0321e0SJeremy L Thompson const CeedScalar *qweight1d, 555*0d0321e0SJeremy L Thompson CeedBasis basis) { 556*0d0321e0SJeremy L Thompson int ierr; 557*0d0321e0SJeremy L Thompson Ceed ceed; 558*0d0321e0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 559*0d0321e0SJeremy L Thompson CeedBasis_Cuda *data; 560*0d0321e0SJeremy L Thompson ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); 561*0d0321e0SJeremy L Thompson 562*0d0321e0SJeremy L Thompson // Copy data to GPU 563*0d0321e0SJeremy L Thompson const CeedInt qBytes = Q1d * sizeof(CeedScalar); 564*0d0321e0SJeremy L Thompson ierr = cudaMalloc((void **)&data->d_qweight1d, qBytes); CeedChk_Cu(ceed,ierr); 565*0d0321e0SJeremy L Thompson ierr = cudaMemcpy(data->d_qweight1d, qweight1d, qBytes, 566*0d0321e0SJeremy L Thompson cudaMemcpyHostToDevice); CeedChk_Cu(ceed,ierr); 567*0d0321e0SJeremy L Thompson 568*0d0321e0SJeremy L Thompson const CeedInt iBytes = qBytes * P1d; 569*0d0321e0SJeremy L Thompson ierr = cudaMalloc((void **)&data->d_interp1d, iBytes); CeedChk_Cu(ceed,ierr); 570*0d0321e0SJeremy L Thompson ierr = cudaMemcpy(data->d_interp1d, interp1d, iBytes, 571*0d0321e0SJeremy L Thompson cudaMemcpyHostToDevice); CeedChk_Cu(ceed,ierr); 572*0d0321e0SJeremy L Thompson 573*0d0321e0SJeremy L Thompson ierr = cudaMalloc((void **)&data->d_grad1d, iBytes); CeedChk_Cu(ceed,ierr); 574*0d0321e0SJeremy L Thompson ierr = cudaMemcpy(data->d_grad1d, grad1d, iBytes, 575*0d0321e0SJeremy L Thompson cudaMemcpyHostToDevice); CeedChk_Cu(ceed,ierr); 576*0d0321e0SJeremy L Thompson 577*0d0321e0SJeremy L Thompson // Complie basis kernels 578*0d0321e0SJeremy L Thompson CeedInt ncomp; 579*0d0321e0SJeremy L Thompson ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr); 580*0d0321e0SJeremy L Thompson ierr = CeedCompileCuda(ceed, basiskernels, &data->module, 7, 581*0d0321e0SJeremy L Thompson "BASIS_Q1D", Q1d, 582*0d0321e0SJeremy L Thompson "BASIS_P1D", P1d, 583*0d0321e0SJeremy L Thompson "BASIS_BUF_LEN", ncomp * CeedIntPow(Q1d > P1d ? 584*0d0321e0SJeremy L Thompson Q1d : P1d, dim), 585*0d0321e0SJeremy L Thompson "BASIS_DIM", dim, 586*0d0321e0SJeremy L Thompson "BASIS_NCOMP", ncomp, 587*0d0321e0SJeremy L Thompson "BASIS_ELEMSIZE", CeedIntPow(P1d, dim), 588*0d0321e0SJeremy L Thompson "BASIS_NQPT", CeedIntPow(Q1d, dim) 589*0d0321e0SJeremy L Thompson ); CeedChkBackend(ierr); 590*0d0321e0SJeremy L Thompson ierr = CeedGetKernelCuda(ceed, data->module, "interp", &data->interp); 591*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 592*0d0321e0SJeremy L Thompson ierr = CeedGetKernelCuda(ceed, data->module, "grad", &data->grad); 593*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 594*0d0321e0SJeremy L Thompson ierr = CeedGetKernelCuda(ceed, data->module, "weight", &data->weight); 595*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 596*0d0321e0SJeremy L Thompson ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr); 597*0d0321e0SJeremy L Thompson 598*0d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 599*0d0321e0SJeremy L Thompson CeedBasisApply_Cuda); CeedChkBackend(ierr); 600*0d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 601*0d0321e0SJeremy L Thompson CeedBasisDestroy_Cuda); CeedChkBackend(ierr); 602*0d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 603*0d0321e0SJeremy L Thompson } 604*0d0321e0SJeremy L Thompson 605*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 606*0d0321e0SJeremy L Thompson // Create non-tensor 607*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 608*0d0321e0SJeremy L Thompson int CeedBasisCreateH1_Cuda(CeedElemTopology topo, CeedInt dim, CeedInt nnodes, 609*0d0321e0SJeremy L Thompson CeedInt nqpts, const CeedScalar *interp, 610*0d0321e0SJeremy L Thompson const CeedScalar *grad, const CeedScalar *qref, 611*0d0321e0SJeremy L Thompson const CeedScalar *qweight, CeedBasis basis) { 612*0d0321e0SJeremy L Thompson int ierr; 613*0d0321e0SJeremy L Thompson Ceed ceed; 614*0d0321e0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 615*0d0321e0SJeremy L Thompson CeedBasisNonTensor_Cuda *data; 616*0d0321e0SJeremy L Thompson ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); 617*0d0321e0SJeremy L Thompson 618*0d0321e0SJeremy L Thompson // Copy basis data to GPU 619*0d0321e0SJeremy L Thompson const CeedInt qBytes = nqpts * sizeof(CeedScalar); 620*0d0321e0SJeremy L Thompson ierr = cudaMalloc((void **)&data->d_qweight, qBytes); CeedChk_Cu(ceed, ierr); 621*0d0321e0SJeremy L Thompson ierr = cudaMemcpy(data->d_qweight, qweight, qBytes, 622*0d0321e0SJeremy L Thompson cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 623*0d0321e0SJeremy L Thompson 624*0d0321e0SJeremy L Thompson const CeedInt iBytes = qBytes * nnodes; 625*0d0321e0SJeremy L Thompson ierr = cudaMalloc((void **)&data->d_interp, iBytes); CeedChk_Cu(ceed, ierr); 626*0d0321e0SJeremy L Thompson ierr = cudaMemcpy(data->d_interp, interp, iBytes, 627*0d0321e0SJeremy L Thompson cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 628*0d0321e0SJeremy L Thompson 629*0d0321e0SJeremy L Thompson const CeedInt gBytes = qBytes * nnodes * dim; 630*0d0321e0SJeremy L Thompson ierr = cudaMalloc((void **)&data->d_grad, gBytes); CeedChk_Cu(ceed, ierr); 631*0d0321e0SJeremy L Thompson ierr = cudaMemcpy(data->d_grad, grad, gBytes, 632*0d0321e0SJeremy L Thompson cudaMemcpyHostToDevice); CeedChk_Cu(ceed, ierr); 633*0d0321e0SJeremy L Thompson 634*0d0321e0SJeremy L Thompson // Compile basis kernels 635*0d0321e0SJeremy L Thompson CeedInt ncomp; 636*0d0321e0SJeremy L Thompson ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr); 637*0d0321e0SJeremy L Thompson ierr = CeedCompileCuda(ceed, kernelsNonTensorRef, &data->module, 4, 638*0d0321e0SJeremy L Thompson "Q", nqpts, 639*0d0321e0SJeremy L Thompson "P", nnodes, 640*0d0321e0SJeremy L Thompson "BASIS_DIM", dim, 641*0d0321e0SJeremy L Thompson "BASIS_NCOMP", ncomp 642*0d0321e0SJeremy L Thompson ); CeedChk_Cu(ceed, ierr); 643*0d0321e0SJeremy L Thompson ierr = CeedGetKernelCuda(ceed, data->module, "interp", &data->interp); 644*0d0321e0SJeremy L Thompson CeedChk_Cu(ceed, ierr); 645*0d0321e0SJeremy L Thompson ierr = CeedGetKernelCuda(ceed, data->module, "grad", &data->grad); 646*0d0321e0SJeremy L Thompson CeedChk_Cu(ceed, ierr); 647*0d0321e0SJeremy L Thompson ierr = CeedGetKernelCuda(ceed, data->module, "weight", &data->weight); 648*0d0321e0SJeremy L Thompson CeedChk_Cu(ceed, ierr); 649*0d0321e0SJeremy L Thompson 650*0d0321e0SJeremy L Thompson ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr); 651*0d0321e0SJeremy L Thompson 652*0d0321e0SJeremy L Thompson // Register backend functions 653*0d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 654*0d0321e0SJeremy L Thompson CeedBasisApplyNonTensor_Cuda); CeedChkBackend(ierr); 655*0d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 656*0d0321e0SJeremy L Thompson CeedBasisDestroyNonTensor_Cuda); CeedChkBackend(ierr); 657*0d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 658*0d0321e0SJeremy L Thompson } 659*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 660