1*0d0321e0SJeremy L Thompson // Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC. 2*0d0321e0SJeremy L Thompson // Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707. 3*0d0321e0SJeremy L Thompson // All Rights reserved. See files LICENSE and NOTICE for details. 4*0d0321e0SJeremy L Thompson // 5*0d0321e0SJeremy L Thompson // This file is part of CEED, a collection of benchmarks, miniapps, software 6*0d0321e0SJeremy L Thompson // libraries and APIs for efficient high-order finite element and spectral 7*0d0321e0SJeremy L Thompson // element discretizations for exascale applications. For more information and 8*0d0321e0SJeremy L Thompson // source code availability see http://github.com/ceed. 9*0d0321e0SJeremy L Thompson // 10*0d0321e0SJeremy L Thompson // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC, 11*0d0321e0SJeremy L Thompson // a collaborative effort of two U.S. Department of Energy organizations (Office 12*0d0321e0SJeremy L Thompson // of Science and the National Nuclear Security Administration) responsible for 13*0d0321e0SJeremy L Thompson // the planning and preparation of a capable exascale ecosystem, including 14*0d0321e0SJeremy L Thompson // software, applications, hardware, advanced system engineering and early 15*0d0321e0SJeremy L Thompson // testbed platforms, in support of the nation's exascale computing imperative. 16*0d0321e0SJeremy L Thompson 17*0d0321e0SJeremy L Thompson #include <ceed/ceed.h> 18*0d0321e0SJeremy L Thompson #include <ceed/backend.h> 19*0d0321e0SJeremy L Thompson #include <hip/hip_runtime.h> 20*0d0321e0SJeremy L Thompson #include "ceed-hip-ref.h" 21*0d0321e0SJeremy L Thompson #include "../hip/ceed-hip-compile.h" 22*0d0321e0SJeremy L Thompson 23*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 24*0d0321e0SJeremy L Thompson // Tensor Basis Kernels 25*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 26*0d0321e0SJeremy L Thompson // *INDENT-OFF* 27*0d0321e0SJeremy L Thompson static const char *basiskernels = QUOTE( 28*0d0321e0SJeremy L Thompson 29*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 30*0d0321e0SJeremy L Thompson // Interp 31*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 32*0d0321e0SJeremy L Thompson extern "C" __global__ void interp(const CeedInt nelem, const int transpose, 33*0d0321e0SJeremy L Thompson const CeedScalar *__restrict__ interp1d, 34*0d0321e0SJeremy L Thompson const CeedScalar *__restrict__ u, 35*0d0321e0SJeremy L Thompson CeedScalar *__restrict__ v) { 36*0d0321e0SJeremy L Thompson const CeedInt i = threadIdx.x; 37*0d0321e0SJeremy L Thompson 38*0d0321e0SJeremy L Thompson __shared__ CeedScalar s_mem[BASIS_Q1D * BASIS_P1D + 2 * BASIS_BUF_LEN]; 39*0d0321e0SJeremy L Thompson CeedScalar *s_interp1d = s_mem; 40*0d0321e0SJeremy L Thompson CeedScalar *s_buf1 = s_mem + BASIS_Q1D * BASIS_P1D; 41*0d0321e0SJeremy L Thompson CeedScalar *s_buf2 = s_buf1 + BASIS_BUF_LEN; 42*0d0321e0SJeremy L Thompson for (CeedInt k = i; k < BASIS_Q1D * BASIS_P1D; k += blockDim.x) { 43*0d0321e0SJeremy L Thompson s_interp1d[k] = interp1d[k]; 44*0d0321e0SJeremy L Thompson } 45*0d0321e0SJeremy L Thompson 46*0d0321e0SJeremy L Thompson const CeedInt P = transpose ? BASIS_Q1D : BASIS_P1D; 47*0d0321e0SJeremy L Thompson const CeedInt Q = transpose ? BASIS_P1D : BASIS_Q1D; 48*0d0321e0SJeremy L Thompson const CeedInt stride0 = transpose ? 1 : BASIS_P1D; 49*0d0321e0SJeremy L Thompson const CeedInt stride1 = transpose ? BASIS_P1D : 1; 50*0d0321e0SJeremy L Thompson const CeedInt u_stride = transpose ? BASIS_NQPT : BASIS_ELEMSIZE; 51*0d0321e0SJeremy L Thompson const CeedInt v_stride = transpose ? BASIS_ELEMSIZE : BASIS_NQPT; 52*0d0321e0SJeremy L Thompson const CeedInt u_comp_stride = nelem * (transpose ? BASIS_NQPT : BASIS_ELEMSIZE); 53*0d0321e0SJeremy L Thompson const CeedInt v_comp_stride = nelem * (transpose ? BASIS_ELEMSIZE : BASIS_NQPT); 54*0d0321e0SJeremy L Thompson const CeedInt u_size = transpose ? BASIS_NQPT : BASIS_ELEMSIZE; 55*0d0321e0SJeremy L Thompson 56*0d0321e0SJeremy L Thompson // Apply basis element by element 57*0d0321e0SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < nelem; elem += gridDim.x) { 58*0d0321e0SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NCOMP; ++comp) { 59*0d0321e0SJeremy L Thompson const CeedScalar *cur_u = u + elem * u_stride + comp * u_comp_stride; 60*0d0321e0SJeremy L Thompson CeedScalar *cur_v = v + elem * v_stride + comp * v_comp_stride; 61*0d0321e0SJeremy L Thompson for (CeedInt k = i; k < u_size; k += blockDim.x) { 62*0d0321e0SJeremy L Thompson s_buf1[k] = cur_u[k]; 63*0d0321e0SJeremy L Thompson } 64*0d0321e0SJeremy L Thompson CeedInt pre = u_size; 65*0d0321e0SJeremy L Thompson CeedInt post = 1; 66*0d0321e0SJeremy L Thompson for (CeedInt d = 0; d < BASIS_DIM; d++) { 67*0d0321e0SJeremy L Thompson __syncthreads(); 68*0d0321e0SJeremy L Thompson // Update buffers used 69*0d0321e0SJeremy L Thompson pre /= P; 70*0d0321e0SJeremy L Thompson const CeedScalar *in = d % 2 ? s_buf2 : s_buf1; 71*0d0321e0SJeremy L Thompson CeedScalar *out = d == BASIS_DIM - 1 ? cur_v : (d % 2 ? s_buf1 : s_buf2); 72*0d0321e0SJeremy L Thompson 73*0d0321e0SJeremy L Thompson // Contract along middle index 74*0d0321e0SJeremy L Thompson const CeedInt writeLen = pre * post * Q; 75*0d0321e0SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 76*0d0321e0SJeremy L Thompson const CeedInt c = k % post; 77*0d0321e0SJeremy L Thompson const CeedInt j = (k / post) % Q; 78*0d0321e0SJeremy L Thompson const CeedInt a = k / (post * Q); 79*0d0321e0SJeremy L Thompson 80*0d0321e0SJeremy L Thompson CeedScalar vk = 0; 81*0d0321e0SJeremy L Thompson for (CeedInt b = 0; b < P; b++) 82*0d0321e0SJeremy L Thompson vk += s_interp1d[j*stride0 + b*stride1] * in[(a*P + b)*post + c]; 83*0d0321e0SJeremy L Thompson 84*0d0321e0SJeremy L Thompson out[k] = vk; 85*0d0321e0SJeremy L Thompson } 86*0d0321e0SJeremy L Thompson 87*0d0321e0SJeremy L Thompson post *= Q; 88*0d0321e0SJeremy L Thompson } 89*0d0321e0SJeremy L Thompson } 90*0d0321e0SJeremy L Thompson } 91*0d0321e0SJeremy L Thompson } 92*0d0321e0SJeremy L Thompson 93*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 94*0d0321e0SJeremy L Thompson // Grad 95*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 96*0d0321e0SJeremy L Thompson extern "C" __global__ void grad(const CeedInt nelem, const int transpose, 97*0d0321e0SJeremy L Thompson const CeedScalar *__restrict__ interp1d, 98*0d0321e0SJeremy L Thompson const CeedScalar *__restrict__ grad1d, 99*0d0321e0SJeremy L Thompson const CeedScalar *__restrict__ u, 100*0d0321e0SJeremy L Thompson CeedScalar *__restrict__ v) { 101*0d0321e0SJeremy L Thompson const CeedInt i = threadIdx.x; 102*0d0321e0SJeremy L Thompson 103*0d0321e0SJeremy L Thompson __shared__ CeedScalar s_mem[2 * (BASIS_Q1D * BASIS_P1D + BASIS_BUF_LEN)]; 104*0d0321e0SJeremy L Thompson CeedScalar *s_interp1d = s_mem; 105*0d0321e0SJeremy L Thompson CeedScalar *s_grad1d = s_interp1d + BASIS_Q1D * BASIS_P1D; 106*0d0321e0SJeremy L Thompson CeedScalar *s_buf1 = s_grad1d + BASIS_Q1D * BASIS_P1D; 107*0d0321e0SJeremy L Thompson CeedScalar *s_buf2 = s_buf1 + BASIS_BUF_LEN; 108*0d0321e0SJeremy L Thompson for (CeedInt k = i; k < BASIS_Q1D * BASIS_P1D; k += blockDim.x) { 109*0d0321e0SJeremy L Thompson s_interp1d[k] = interp1d[k]; 110*0d0321e0SJeremy L Thompson s_grad1d[k] = grad1d[k]; 111*0d0321e0SJeremy L Thompson } 112*0d0321e0SJeremy L Thompson 113*0d0321e0SJeremy L Thompson const CeedInt P = transpose ? BASIS_Q1D : BASIS_P1D; 114*0d0321e0SJeremy L Thompson const CeedInt Q = transpose ? BASIS_P1D : BASIS_Q1D; 115*0d0321e0SJeremy L Thompson const CeedInt stride0 = transpose ? 1 : BASIS_P1D; 116*0d0321e0SJeremy L Thompson const CeedInt stride1 = transpose ? BASIS_P1D : 1; 117*0d0321e0SJeremy L Thompson const CeedInt u_stride = transpose ? BASIS_NQPT : BASIS_ELEMSIZE; 118*0d0321e0SJeremy L Thompson const CeedInt v_stride = transpose ? BASIS_ELEMSIZE : BASIS_NQPT; 119*0d0321e0SJeremy L Thompson const CeedInt u_comp_stride = nelem * (transpose ? BASIS_NQPT : BASIS_ELEMSIZE); 120*0d0321e0SJeremy L Thompson const CeedInt v_comp_stride = nelem * (transpose ? BASIS_ELEMSIZE : BASIS_NQPT); 121*0d0321e0SJeremy L Thompson const CeedInt u_dim_stride = transpose ? nelem * BASIS_NQPT * BASIS_NCOMP : 0; 122*0d0321e0SJeremy L Thompson const CeedInt v_dim_stride = transpose ? 0 : nelem * BASIS_NQPT * BASIS_NCOMP; 123*0d0321e0SJeremy L Thompson 124*0d0321e0SJeremy L Thompson // Apply basis element by element 125*0d0321e0SJeremy L Thompson for (CeedInt elem = blockIdx.x; elem < nelem; elem += gridDim.x) { 126*0d0321e0SJeremy L Thompson for (CeedInt comp = 0; comp < BASIS_NCOMP; ++comp) { 127*0d0321e0SJeremy L Thompson 128*0d0321e0SJeremy L Thompson // dim*dim contractions for grad 129*0d0321e0SJeremy L Thompson for (CeedInt dim1 = 0; dim1 < BASIS_DIM; dim1++) { 130*0d0321e0SJeremy L Thompson CeedInt pre = transpose ? BASIS_NQPT : BASIS_ELEMSIZE; 131*0d0321e0SJeremy L Thompson CeedInt post = 1; 132*0d0321e0SJeremy L Thompson const CeedScalar *cur_u = u + elem * u_stride + dim1 * u_dim_stride + 133*0d0321e0SJeremy L Thompson comp * u_comp_stride; 134*0d0321e0SJeremy L Thompson CeedScalar *cur_v = v + elem * v_stride + dim1 * v_dim_stride + comp * 135*0d0321e0SJeremy L Thompson v_comp_stride; 136*0d0321e0SJeremy L Thompson for (CeedInt dim2 = 0; dim2 < BASIS_DIM; dim2++) { 137*0d0321e0SJeremy L Thompson __syncthreads(); 138*0d0321e0SJeremy L Thompson // Update buffers used 139*0d0321e0SJeremy L Thompson pre /= P; 140*0d0321e0SJeremy L Thompson const CeedScalar *op = dim1 == dim2 ? s_grad1d : s_interp1d; 141*0d0321e0SJeremy L Thompson const CeedScalar *in = dim2 == 0 ? cur_u : (dim2 % 2 ? s_buf2 : s_buf1); 142*0d0321e0SJeremy L Thompson CeedScalar *out = dim2 == BASIS_DIM - 1 ? cur_v : (dim2 % 2 ? s_buf1 : s_buf2); 143*0d0321e0SJeremy L Thompson 144*0d0321e0SJeremy L Thompson // Contract along middle index 145*0d0321e0SJeremy L Thompson const CeedInt writeLen = pre * post * Q; 146*0d0321e0SJeremy L Thompson for (CeedInt k = i; k < writeLen; k += blockDim.x) { 147*0d0321e0SJeremy L Thompson const CeedInt c = k % post; 148*0d0321e0SJeremy L Thompson const CeedInt j = (k / post) % Q; 149*0d0321e0SJeremy L Thompson const CeedInt a = k / (post * Q); 150*0d0321e0SJeremy L Thompson CeedScalar vk = 0; 151*0d0321e0SJeremy L Thompson for (CeedInt b = 0; b < P; b++) 152*0d0321e0SJeremy L Thompson vk += op[j * stride0 + b * stride1] * in[(a * P + b) * post + c]; 153*0d0321e0SJeremy L Thompson 154*0d0321e0SJeremy L Thompson if (transpose && dim2 == BASIS_DIM - 1) 155*0d0321e0SJeremy L Thompson out[k] += vk; 156*0d0321e0SJeremy L Thompson else 157*0d0321e0SJeremy L Thompson out[k] = vk; 158*0d0321e0SJeremy L Thompson } 159*0d0321e0SJeremy L Thompson 160*0d0321e0SJeremy L Thompson post *= Q; 161*0d0321e0SJeremy L Thompson } 162*0d0321e0SJeremy L Thompson } 163*0d0321e0SJeremy L Thompson } 164*0d0321e0SJeremy L Thompson } 165*0d0321e0SJeremy L Thompson } 166*0d0321e0SJeremy L Thompson 167*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 168*0d0321e0SJeremy L Thompson // 1D quadrature weights 169*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 170*0d0321e0SJeremy L Thompson __device__ void weight1d(const CeedInt nelem, const CeedScalar *qweight1d, 171*0d0321e0SJeremy L Thompson CeedScalar *w) { 172*0d0321e0SJeremy L Thompson CeedScalar w1d[BASIS_Q1D]; 173*0d0321e0SJeremy L Thompson for (int i = 0; i < BASIS_Q1D; ++i) 174*0d0321e0SJeremy L Thompson w1d[i] = qweight1d[i]; 175*0d0321e0SJeremy L Thompson 176*0d0321e0SJeremy L Thompson for (int e = blockIdx.x * blockDim.x + threadIdx.x; 177*0d0321e0SJeremy L Thompson e < nelem; 178*0d0321e0SJeremy L Thompson e += blockDim.x * gridDim.x) 179*0d0321e0SJeremy L Thompson for (int i = 0; i < BASIS_Q1D; ++i) { 180*0d0321e0SJeremy L Thompson const int ind = e*BASIS_Q1D + i; // sequential 181*0d0321e0SJeremy L Thompson w[ind] = w1d[i]; 182*0d0321e0SJeremy L Thompson } 183*0d0321e0SJeremy L Thompson } 184*0d0321e0SJeremy L Thompson 185*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 186*0d0321e0SJeremy L Thompson // 2D quadrature weights 187*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 188*0d0321e0SJeremy L Thompson __device__ void weight2d(const CeedInt nelem, const CeedScalar *qweight1d, 189*0d0321e0SJeremy L Thompson CeedScalar *w) { 190*0d0321e0SJeremy L Thompson CeedScalar w1d[BASIS_Q1D]; 191*0d0321e0SJeremy L Thompson for (int i = 0; i < BASIS_Q1D; ++i) 192*0d0321e0SJeremy L Thompson w1d[i] = qweight1d[i]; 193*0d0321e0SJeremy L Thompson 194*0d0321e0SJeremy L Thompson for (int e = blockIdx.x * blockDim.x + threadIdx.x; 195*0d0321e0SJeremy L Thompson e < nelem; 196*0d0321e0SJeremy L Thompson e += blockDim.x * gridDim.x) 197*0d0321e0SJeremy L Thompson for (int i = 0; i < BASIS_Q1D; ++i) 198*0d0321e0SJeremy L Thompson for (int j = 0; j < BASIS_Q1D; ++j) { 199*0d0321e0SJeremy L Thompson const int ind = e*BASIS_Q1D*BASIS_Q1D + i + j*BASIS_Q1D; // sequential 200*0d0321e0SJeremy L Thompson w[ind] = w1d[i]*w1d[j]; 201*0d0321e0SJeremy L Thompson } 202*0d0321e0SJeremy L Thompson } 203*0d0321e0SJeremy L Thompson 204*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 205*0d0321e0SJeremy L Thompson // 3D quadrature weights 206*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 207*0d0321e0SJeremy L Thompson __device__ void weight3d(const CeedInt nelem, const CeedScalar *qweight1d, 208*0d0321e0SJeremy L Thompson CeedScalar *w) { 209*0d0321e0SJeremy L Thompson CeedScalar w1d[BASIS_Q1D]; 210*0d0321e0SJeremy L Thompson for (int i = 0; i < BASIS_Q1D; ++i) 211*0d0321e0SJeremy L Thompson w1d[i] = qweight1d[i]; 212*0d0321e0SJeremy L Thompson 213*0d0321e0SJeremy L Thompson for (int e = blockIdx.x * blockDim.x + threadIdx.x; 214*0d0321e0SJeremy L Thompson e < nelem; 215*0d0321e0SJeremy L Thompson e += blockDim.x * gridDim.x) 216*0d0321e0SJeremy L Thompson for (int i = 0; i < BASIS_Q1D; ++i) 217*0d0321e0SJeremy L Thompson for (int j = 0; j < BASIS_Q1D; ++j) 218*0d0321e0SJeremy L Thompson for (int k = 0; k < BASIS_Q1D; ++k) { 219*0d0321e0SJeremy L Thompson const int ind = e*BASIS_Q1D*BASIS_Q1D*BASIS_Q1D + i + j*BASIS_Q1D + 220*0d0321e0SJeremy L Thompson k*BASIS_Q1D*BASIS_Q1D; // sequential 221*0d0321e0SJeremy L Thompson w[ind] = w1d[i]*w1d[j]*w1d[k]; 222*0d0321e0SJeremy L Thompson } 223*0d0321e0SJeremy L Thompson } 224*0d0321e0SJeremy L Thompson 225*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 226*0d0321e0SJeremy L Thompson // Quadrature weights 227*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 228*0d0321e0SJeremy L Thompson extern "C" __global__ void weight(const CeedInt nelem, 229*0d0321e0SJeremy L Thompson const CeedScalar *__restrict__ qweight1d, 230*0d0321e0SJeremy L Thompson CeedScalar *__restrict__ v) { 231*0d0321e0SJeremy L Thompson if (BASIS_DIM==1) 232*0d0321e0SJeremy L Thompson weight1d(nelem, qweight1d, v); 233*0d0321e0SJeremy L Thompson else if (BASIS_DIM==2) 234*0d0321e0SJeremy L Thompson weight2d(nelem, qweight1d, v); 235*0d0321e0SJeremy L Thompson else if (BASIS_DIM==3) 236*0d0321e0SJeremy L Thompson weight3d(nelem, qweight1d, v); 237*0d0321e0SJeremy L Thompson } 238*0d0321e0SJeremy L Thompson 239*0d0321e0SJeremy L Thompson ); 240*0d0321e0SJeremy L Thompson 241*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 242*0d0321e0SJeremy L Thompson // Non-Tensor Basis Kernels 243*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 244*0d0321e0SJeremy L Thompson static const char *kernelsNonTensorRef = QUOTE( 245*0d0321e0SJeremy L Thompson 246*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 247*0d0321e0SJeremy L Thompson // Interp 248*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 249*0d0321e0SJeremy L Thompson extern "C" __global__ void interp(const CeedInt nelem, const int transpose, 250*0d0321e0SJeremy L Thompson const CeedScalar *d_B, 251*0d0321e0SJeremy L Thompson const CeedScalar *__restrict__ d_U, 252*0d0321e0SJeremy L Thompson CeedScalar *__restrict__ d_V) { 253*0d0321e0SJeremy L Thompson const int tid = threadIdx.x; 254*0d0321e0SJeremy L Thompson 255*0d0321e0SJeremy L Thompson const CeedScalar *U; 256*0d0321e0SJeremy L Thompson CeedScalar V; 257*0d0321e0SJeremy L Thompson //TODO load B in shared memory if blockDim.z > 1? 258*0d0321e0SJeremy L Thompson 259*0d0321e0SJeremy L Thompson for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; 260*0d0321e0SJeremy L Thompson elem += gridDim.x*blockDim.z) { 261*0d0321e0SJeremy L Thompson for (int comp = 0; comp < BASIS_NCOMP; comp++) { 262*0d0321e0SJeremy L Thompson if (!transpose) { // run with Q threads 263*0d0321e0SJeremy L Thompson U = d_U + elem*P + comp*nelem*P; 264*0d0321e0SJeremy L Thompson V = 0.0; 265*0d0321e0SJeremy L Thompson for (int i = 0; i < P; ++i) 266*0d0321e0SJeremy L Thompson V += d_B[i + tid*P]*U[i]; 267*0d0321e0SJeremy L Thompson 268*0d0321e0SJeremy L Thompson d_V[elem*Q + comp*nelem*Q + tid] = V; 269*0d0321e0SJeremy L Thompson } else { // run with P threads 270*0d0321e0SJeremy L Thompson U = d_U + elem*Q + comp*nelem*Q; 271*0d0321e0SJeremy L Thompson V = 0.0; 272*0d0321e0SJeremy L Thompson for (int i = 0; i < Q; ++i) 273*0d0321e0SJeremy L Thompson V += d_B[tid + i*P]*U[i]; 274*0d0321e0SJeremy L Thompson 275*0d0321e0SJeremy L Thompson d_V[elem*P + comp*nelem*P + tid] = V; 276*0d0321e0SJeremy L Thompson } 277*0d0321e0SJeremy L Thompson } 278*0d0321e0SJeremy L Thompson } 279*0d0321e0SJeremy L Thompson } 280*0d0321e0SJeremy L Thompson 281*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 282*0d0321e0SJeremy L Thompson // Grad 283*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 284*0d0321e0SJeremy L Thompson extern "C" __global__ void grad(const CeedInt nelem, const int transpose, 285*0d0321e0SJeremy L Thompson const CeedScalar *d_G, 286*0d0321e0SJeremy L Thompson const CeedScalar *__restrict__ d_U, 287*0d0321e0SJeremy L Thompson CeedScalar *__restrict__ d_V) { 288*0d0321e0SJeremy L Thompson const int tid = threadIdx.x; 289*0d0321e0SJeremy L Thompson 290*0d0321e0SJeremy L Thompson const CeedScalar *U; 291*0d0321e0SJeremy L Thompson //TODO load G in shared memory if blockDim.z > 1? 292*0d0321e0SJeremy L Thompson 293*0d0321e0SJeremy L Thompson for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; 294*0d0321e0SJeremy L Thompson elem += gridDim.x*blockDim.z) { 295*0d0321e0SJeremy L Thompson for (int comp=0; comp<BASIS_NCOMP; comp++) { 296*0d0321e0SJeremy L Thompson if (!transpose) { // run with Q threads 297*0d0321e0SJeremy L Thompson CeedScalar V[BASIS_DIM]; 298*0d0321e0SJeremy L Thompson U = d_U + elem*P + comp*nelem*P; 299*0d0321e0SJeremy L Thompson for (int dim = 0; dim < BASIS_DIM; dim++) 300*0d0321e0SJeremy L Thompson V[dim] = 0.0; 301*0d0321e0SJeremy L Thompson 302*0d0321e0SJeremy L Thompson for (int i = 0; i < P; ++i) { 303*0d0321e0SJeremy L Thompson const CeedScalar val = U[i]; 304*0d0321e0SJeremy L Thompson for(int dim = 0; dim < BASIS_DIM; dim++) 305*0d0321e0SJeremy L Thompson V[dim] += d_G[i + tid*P + dim*P*Q]*val; 306*0d0321e0SJeremy L Thompson } 307*0d0321e0SJeremy L Thompson for (int dim = 0; dim < BASIS_DIM; dim++) { 308*0d0321e0SJeremy L Thompson d_V[elem*Q + comp*nelem*Q + dim*BASIS_NCOMP*nelem*Q + tid] = V[dim]; 309*0d0321e0SJeremy L Thompson } 310*0d0321e0SJeremy L Thompson } else { // run with P threads 311*0d0321e0SJeremy L Thompson CeedScalar V = 0.0; 312*0d0321e0SJeremy L Thompson for (int dim = 0; dim < BASIS_DIM; dim++) { 313*0d0321e0SJeremy L Thompson U = d_U + elem*Q + comp*nelem*Q +dim*BASIS_NCOMP*nelem*Q; 314*0d0321e0SJeremy L Thompson for (int i = 0; i < Q; ++i) 315*0d0321e0SJeremy L Thompson V += d_G[tid + i*P + dim*P*Q]*U[i]; 316*0d0321e0SJeremy L Thompson } 317*0d0321e0SJeremy L Thompson d_V[elem*P + comp*nelem*P + tid] = V; 318*0d0321e0SJeremy L Thompson } 319*0d0321e0SJeremy L Thompson } 320*0d0321e0SJeremy L Thompson } 321*0d0321e0SJeremy L Thompson } 322*0d0321e0SJeremy L Thompson 323*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 324*0d0321e0SJeremy L Thompson // Weight 325*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 326*0d0321e0SJeremy L Thompson extern "C" __global__ void weight(const CeedInt nelem, 327*0d0321e0SJeremy L Thompson const CeedScalar *__restrict__ qweight, 328*0d0321e0SJeremy L Thompson CeedScalar *__restrict__ d_V) { 329*0d0321e0SJeremy L Thompson const int tid = threadIdx.x; 330*0d0321e0SJeremy L Thompson //TODO load qweight in shared memory if blockDim.z > 1? 331*0d0321e0SJeremy L Thompson for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < nelem; 332*0d0321e0SJeremy L Thompson elem += gridDim.x*blockDim.z) { 333*0d0321e0SJeremy L Thompson d_V[elem*Q + tid] = qweight[tid]; 334*0d0321e0SJeremy L Thompson } 335*0d0321e0SJeremy L Thompson } 336*0d0321e0SJeremy L Thompson 337*0d0321e0SJeremy L Thompson ); 338*0d0321e0SJeremy L Thompson // *INDENT-ON* 339*0d0321e0SJeremy L Thompson 340*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 341*0d0321e0SJeremy L Thompson // Basis apply - tensor 342*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 343*0d0321e0SJeremy L Thompson int CeedBasisApply_Hip(CeedBasis basis, const CeedInt nelem, 344*0d0321e0SJeremy L Thompson CeedTransposeMode tmode, 345*0d0321e0SJeremy L Thompson CeedEvalMode emode, CeedVector u, CeedVector v) { 346*0d0321e0SJeremy L Thompson int ierr; 347*0d0321e0SJeremy L Thompson Ceed ceed; 348*0d0321e0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 349*0d0321e0SJeremy L Thompson Ceed_Hip *ceed_Hip; 350*0d0321e0SJeremy L Thompson ierr = CeedGetData(ceed, &ceed_Hip); CeedChkBackend(ierr); 351*0d0321e0SJeremy L Thompson CeedBasis_Hip *data; 352*0d0321e0SJeremy L Thompson ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 353*0d0321e0SJeremy L Thompson const CeedInt transpose = tmode == CEED_TRANSPOSE; 354*0d0321e0SJeremy L Thompson const int maxblocksize = 64; 355*0d0321e0SJeremy L Thompson 356*0d0321e0SJeremy L Thompson // Read vectors 357*0d0321e0SJeremy L Thompson const CeedScalar *d_u; 358*0d0321e0SJeremy L Thompson CeedScalar *d_v; 359*0d0321e0SJeremy L Thompson if (emode != CEED_EVAL_WEIGHT) { 360*0d0321e0SJeremy L Thompson ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); 361*0d0321e0SJeremy L Thompson } 362*0d0321e0SJeremy L Thompson ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); 363*0d0321e0SJeremy L Thompson 364*0d0321e0SJeremy L Thompson // Clear v for transpose operation 365*0d0321e0SJeremy L Thompson if (tmode == CEED_TRANSPOSE) { 366*0d0321e0SJeremy L Thompson CeedInt length; 367*0d0321e0SJeremy L Thompson ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr); 368*0d0321e0SJeremy L Thompson ierr = hipMemset(d_v, 0, length * sizeof(CeedScalar)); 369*0d0321e0SJeremy L Thompson CeedChk_Hip(ceed,ierr); 370*0d0321e0SJeremy L Thompson } 371*0d0321e0SJeremy L Thompson 372*0d0321e0SJeremy L Thompson // Basis action 373*0d0321e0SJeremy L Thompson switch (emode) { 374*0d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: { 375*0d0321e0SJeremy L Thompson void *interpargs[] = {(void *) &nelem, (void *) &transpose, 376*0d0321e0SJeremy L Thompson &data->d_interp1d, &d_u, &d_v 377*0d0321e0SJeremy L Thompson }; 378*0d0321e0SJeremy L Thompson CeedInt Q1d, dim; 379*0d0321e0SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints1D(basis, &Q1d); CeedChkBackend(ierr); 380*0d0321e0SJeremy L Thompson ierr = CeedBasisGetDimension(basis, &dim); CeedChkBackend(ierr); 381*0d0321e0SJeremy L Thompson CeedInt blocksize = CeedIntPow(Q1d, dim); 382*0d0321e0SJeremy L Thompson blocksize = blocksize > maxblocksize ? maxblocksize : blocksize; 383*0d0321e0SJeremy L Thompson 384*0d0321e0SJeremy L Thompson ierr = CeedRunKernelHip(ceed, data->interp, nelem, blocksize, interpargs); 385*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 386*0d0321e0SJeremy L Thompson } break; 387*0d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: { 388*0d0321e0SJeremy L Thompson void *gradargs[] = {(void *) &nelem, (void *) &transpose, &data->d_interp1d, 389*0d0321e0SJeremy L Thompson &data->d_grad1d, &d_u, &d_v 390*0d0321e0SJeremy L Thompson }; 391*0d0321e0SJeremy L Thompson CeedInt blocksize = maxblocksize; 392*0d0321e0SJeremy L Thompson 393*0d0321e0SJeremy L Thompson ierr = CeedRunKernelHip(ceed, data->grad, nelem, blocksize, gradargs); 394*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 395*0d0321e0SJeremy L Thompson } break; 396*0d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: { 397*0d0321e0SJeremy L Thompson void *weightargs[] = {(void *) &nelem, (void *) &data->d_qweight1d, &d_v}; 398*0d0321e0SJeremy L Thompson const int blocksize = 64; 399*0d0321e0SJeremy L Thompson int gridsize = nelem/blocksize; 400*0d0321e0SJeremy L Thompson if (blocksize * gridsize < nelem) 401*0d0321e0SJeremy L Thompson gridsize += 1; 402*0d0321e0SJeremy L Thompson 403*0d0321e0SJeremy L Thompson ierr = CeedRunKernelHip(ceed, data->weight, gridsize, blocksize, 404*0d0321e0SJeremy L Thompson weightargs); CeedChkBackend(ierr); 405*0d0321e0SJeremy L Thompson } break; 406*0d0321e0SJeremy L Thompson // LCOV_EXCL_START 407*0d0321e0SJeremy L Thompson // Evaluate the divergence to/from the quadrature points 408*0d0321e0SJeremy L Thompson case CEED_EVAL_DIV: 409*0d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 410*0d0321e0SJeremy L Thompson // Evaluate the curl to/from the quadrature points 411*0d0321e0SJeremy L Thompson case CEED_EVAL_CURL: 412*0d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 413*0d0321e0SJeremy L Thompson // Take no action, BasisApply should not have been called 414*0d0321e0SJeremy L Thompson case CEED_EVAL_NONE: 415*0d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 416*0d0321e0SJeremy L Thompson "CEED_EVAL_NONE does not make sense in this context"); 417*0d0321e0SJeremy L Thompson // LCOV_EXCL_STOP 418*0d0321e0SJeremy L Thompson } 419*0d0321e0SJeremy L Thompson 420*0d0321e0SJeremy L Thompson // Restore vectors 421*0d0321e0SJeremy L Thompson if (emode != CEED_EVAL_WEIGHT) { 422*0d0321e0SJeremy L Thompson ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); 423*0d0321e0SJeremy L Thompson } 424*0d0321e0SJeremy L Thompson ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); 425*0d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 426*0d0321e0SJeremy L Thompson } 427*0d0321e0SJeremy L Thompson 428*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 429*0d0321e0SJeremy L Thompson // Basis apply - non-tensor 430*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 431*0d0321e0SJeremy L Thompson int CeedBasisApplyNonTensor_Hip(CeedBasis basis, const CeedInt nelem, 432*0d0321e0SJeremy L Thompson CeedTransposeMode tmode, CeedEvalMode emode, 433*0d0321e0SJeremy L Thompson CeedVector u, CeedVector v) { 434*0d0321e0SJeremy L Thompson int ierr; 435*0d0321e0SJeremy L Thompson Ceed ceed; 436*0d0321e0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 437*0d0321e0SJeremy L Thompson Ceed_Hip *ceed_Hip; 438*0d0321e0SJeremy L Thompson ierr = CeedGetData(ceed, &ceed_Hip); CeedChkBackend(ierr); 439*0d0321e0SJeremy L Thompson CeedBasisNonTensor_Hip *data; 440*0d0321e0SJeremy L Thompson ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 441*0d0321e0SJeremy L Thompson CeedInt nnodes, nqpt; 442*0d0321e0SJeremy L Thompson ierr = CeedBasisGetNumQuadraturePoints(basis, &nqpt); CeedChkBackend(ierr); 443*0d0321e0SJeremy L Thompson ierr = CeedBasisGetNumNodes(basis, &nnodes); CeedChkBackend(ierr); 444*0d0321e0SJeremy L Thompson const CeedInt transpose = tmode == CEED_TRANSPOSE; 445*0d0321e0SJeremy L Thompson int elemsPerBlock = 1; 446*0d0321e0SJeremy L Thompson int grid = nelem/elemsPerBlock+((nelem/elemsPerBlock*elemsPerBlock<nelem)?1:0); 447*0d0321e0SJeremy L Thompson 448*0d0321e0SJeremy L Thompson // Read vectors 449*0d0321e0SJeremy L Thompson const CeedScalar *d_u; 450*0d0321e0SJeremy L Thompson CeedScalar *d_v; 451*0d0321e0SJeremy L Thompson if (emode != CEED_EVAL_WEIGHT) { 452*0d0321e0SJeremy L Thompson ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &d_u); CeedChkBackend(ierr); 453*0d0321e0SJeremy L Thompson } 454*0d0321e0SJeremy L Thompson ierr = CeedVectorGetArrayWrite(v, CEED_MEM_DEVICE, &d_v); CeedChkBackend(ierr); 455*0d0321e0SJeremy L Thompson 456*0d0321e0SJeremy L Thompson // Clear v for transpose operation 457*0d0321e0SJeremy L Thompson if (tmode == CEED_TRANSPOSE) { 458*0d0321e0SJeremy L Thompson CeedInt length; 459*0d0321e0SJeremy L Thompson ierr = CeedVectorGetLength(v, &length); CeedChkBackend(ierr); 460*0d0321e0SJeremy L Thompson ierr = hipMemset(d_v, 0, length * sizeof(CeedScalar)); 461*0d0321e0SJeremy L Thompson CeedChk_Hip(ceed, ierr); 462*0d0321e0SJeremy L Thompson } 463*0d0321e0SJeremy L Thompson 464*0d0321e0SJeremy L Thompson // Apply basis operation 465*0d0321e0SJeremy L Thompson switch (emode) { 466*0d0321e0SJeremy L Thompson case CEED_EVAL_INTERP: { 467*0d0321e0SJeremy L Thompson void *interpargs[] = {(void *) &nelem, (void *) &transpose, 468*0d0321e0SJeremy L Thompson &data->d_interp, &d_u, &d_v 469*0d0321e0SJeremy L Thompson }; 470*0d0321e0SJeremy L Thompson if (!transpose) { 471*0d0321e0SJeremy L Thompson ierr = CeedRunKernelDimHip(ceed, data->interp, grid, nqpt, 1, 472*0d0321e0SJeremy L Thompson elemsPerBlock, interpargs); CeedChkBackend(ierr); 473*0d0321e0SJeremy L Thompson } else { 474*0d0321e0SJeremy L Thompson ierr = CeedRunKernelDimHip(ceed, data->interp, grid, nnodes, 1, 475*0d0321e0SJeremy L Thompson elemsPerBlock, interpargs); CeedChkBackend(ierr); 476*0d0321e0SJeremy L Thompson } 477*0d0321e0SJeremy L Thompson } break; 478*0d0321e0SJeremy L Thompson case CEED_EVAL_GRAD: { 479*0d0321e0SJeremy L Thompson void *gradargs[] = {(void *) &nelem, (void *) &transpose, &data->d_grad, 480*0d0321e0SJeremy L Thompson &d_u, &d_v 481*0d0321e0SJeremy L Thompson }; 482*0d0321e0SJeremy L Thompson if (!transpose) { 483*0d0321e0SJeremy L Thompson ierr = CeedRunKernelDimHip(ceed, data->grad, grid, nqpt, 1, 484*0d0321e0SJeremy L Thompson elemsPerBlock, gradargs); CeedChkBackend(ierr); 485*0d0321e0SJeremy L Thompson } else { 486*0d0321e0SJeremy L Thompson ierr = CeedRunKernelDimHip(ceed, data->grad, grid, nnodes, 1, 487*0d0321e0SJeremy L Thompson elemsPerBlock, gradargs); CeedChkBackend(ierr); 488*0d0321e0SJeremy L Thompson } 489*0d0321e0SJeremy L Thompson } break; 490*0d0321e0SJeremy L Thompson case CEED_EVAL_WEIGHT: { 491*0d0321e0SJeremy L Thompson void *weightargs[] = {(void *) &nelem, (void *) &data->d_qweight, &d_v}; 492*0d0321e0SJeremy L Thompson ierr = CeedRunKernelDimHip(ceed, data->weight, grid, nqpt, 1, 493*0d0321e0SJeremy L Thompson elemsPerBlock, weightargs); CeedChkBackend(ierr); 494*0d0321e0SJeremy L Thompson } break; 495*0d0321e0SJeremy L Thompson // LCOV_EXCL_START 496*0d0321e0SJeremy L Thompson // Evaluate the divergence to/from the quadrature points 497*0d0321e0SJeremy L Thompson case CEED_EVAL_DIV: 498*0d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_DIV not supported"); 499*0d0321e0SJeremy L Thompson // Evaluate the curl to/from the quadrature points 500*0d0321e0SJeremy L Thompson case CEED_EVAL_CURL: 501*0d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, "CEED_EVAL_CURL not supported"); 502*0d0321e0SJeremy L Thompson // Take no action, BasisApply should not have been called 503*0d0321e0SJeremy L Thompson case CEED_EVAL_NONE: 504*0d0321e0SJeremy L Thompson return CeedError(ceed, CEED_ERROR_BACKEND, 505*0d0321e0SJeremy L Thompson "CEED_EVAL_NONE does not make sense in this context"); 506*0d0321e0SJeremy L Thompson // LCOV_EXCL_STOP 507*0d0321e0SJeremy L Thompson } 508*0d0321e0SJeremy L Thompson 509*0d0321e0SJeremy L Thompson // Restore vectors 510*0d0321e0SJeremy L Thompson if (emode != CEED_EVAL_WEIGHT) { 511*0d0321e0SJeremy L Thompson ierr = CeedVectorRestoreArrayRead(u, &d_u); CeedChkBackend(ierr); 512*0d0321e0SJeremy L Thompson } 513*0d0321e0SJeremy L Thompson ierr = CeedVectorRestoreArray(v, &d_v); CeedChkBackend(ierr); 514*0d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 515*0d0321e0SJeremy L Thompson } 516*0d0321e0SJeremy L Thompson 517*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 518*0d0321e0SJeremy L Thompson // Destroy tensor basis 519*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 520*0d0321e0SJeremy L Thompson static int CeedBasisDestroy_Hip(CeedBasis basis) { 521*0d0321e0SJeremy L Thompson int ierr; 522*0d0321e0SJeremy L Thompson Ceed ceed; 523*0d0321e0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 524*0d0321e0SJeremy L Thompson 525*0d0321e0SJeremy L Thompson CeedBasis_Hip *data; 526*0d0321e0SJeremy L Thompson ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 527*0d0321e0SJeremy L Thompson 528*0d0321e0SJeremy L Thompson CeedChk_Hip(ceed, hipModuleUnload(data->module)); 529*0d0321e0SJeremy L Thompson 530*0d0321e0SJeremy L Thompson ierr = hipFree(data->d_qweight1d); CeedChk_Hip(ceed,ierr); 531*0d0321e0SJeremy L Thompson ierr = hipFree(data->d_interp1d); CeedChk_Hip(ceed,ierr); 532*0d0321e0SJeremy L Thompson ierr = hipFree(data->d_grad1d); CeedChk_Hip(ceed,ierr); 533*0d0321e0SJeremy L Thompson 534*0d0321e0SJeremy L Thompson ierr = CeedFree(&data); CeedChkBackend(ierr); 535*0d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 536*0d0321e0SJeremy L Thompson } 537*0d0321e0SJeremy L Thompson 538*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 539*0d0321e0SJeremy L Thompson // Destroy non-tensor basis 540*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 541*0d0321e0SJeremy L Thompson static int CeedBasisDestroyNonTensor_Hip(CeedBasis basis) { 542*0d0321e0SJeremy L Thompson int ierr; 543*0d0321e0SJeremy L Thompson Ceed ceed; 544*0d0321e0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 545*0d0321e0SJeremy L Thompson 546*0d0321e0SJeremy L Thompson CeedBasisNonTensor_Hip *data; 547*0d0321e0SJeremy L Thompson ierr = CeedBasisGetData(basis, &data); CeedChkBackend(ierr); 548*0d0321e0SJeremy L Thompson 549*0d0321e0SJeremy L Thompson CeedChk_Hip(ceed, hipModuleUnload(data->module)); 550*0d0321e0SJeremy L Thompson 551*0d0321e0SJeremy L Thompson ierr = hipFree(data->d_qweight); CeedChk_Hip(ceed, ierr); 552*0d0321e0SJeremy L Thompson ierr = hipFree(data->d_interp); CeedChk_Hip(ceed, ierr); 553*0d0321e0SJeremy L Thompson ierr = hipFree(data->d_grad); CeedChk_Hip(ceed, ierr); 554*0d0321e0SJeremy L Thompson 555*0d0321e0SJeremy L Thompson ierr = CeedFree(&data); CeedChkBackend(ierr); 556*0d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 557*0d0321e0SJeremy L Thompson } 558*0d0321e0SJeremy L Thompson 559*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 560*0d0321e0SJeremy L Thompson // Create tensor 561*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 562*0d0321e0SJeremy L Thompson int CeedBasisCreateTensorH1_Hip(CeedInt dim, CeedInt P1d, CeedInt Q1d, 563*0d0321e0SJeremy L Thompson const CeedScalar *interp1d, 564*0d0321e0SJeremy L Thompson const CeedScalar *grad1d, 565*0d0321e0SJeremy L Thompson const CeedScalar *qref1d, 566*0d0321e0SJeremy L Thompson const CeedScalar *qweight1d, 567*0d0321e0SJeremy L Thompson CeedBasis basis) { 568*0d0321e0SJeremy L Thompson int ierr; 569*0d0321e0SJeremy L Thompson Ceed ceed; 570*0d0321e0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 571*0d0321e0SJeremy L Thompson CeedBasis_Hip *data; 572*0d0321e0SJeremy L Thompson ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); 573*0d0321e0SJeremy L Thompson 574*0d0321e0SJeremy L Thompson // Copy data to GPU 575*0d0321e0SJeremy L Thompson const CeedInt qBytes = Q1d * sizeof(CeedScalar); 576*0d0321e0SJeremy L Thompson ierr = hipMalloc((void **)&data->d_qweight1d, qBytes); CeedChk_Hip(ceed,ierr); 577*0d0321e0SJeremy L Thompson ierr = hipMemcpy(data->d_qweight1d, qweight1d, qBytes, 578*0d0321e0SJeremy L Thompson hipMemcpyHostToDevice); CeedChk_Hip(ceed,ierr); 579*0d0321e0SJeremy L Thompson 580*0d0321e0SJeremy L Thompson const CeedInt iBytes = qBytes * P1d; 581*0d0321e0SJeremy L Thompson ierr = hipMalloc((void **)&data->d_interp1d, iBytes); CeedChk_Hip(ceed,ierr); 582*0d0321e0SJeremy L Thompson ierr = hipMemcpy(data->d_interp1d, interp1d, iBytes, 583*0d0321e0SJeremy L Thompson hipMemcpyHostToDevice); CeedChk_Hip(ceed,ierr); 584*0d0321e0SJeremy L Thompson 585*0d0321e0SJeremy L Thompson ierr = hipMalloc((void **)&data->d_grad1d, iBytes); CeedChk_Hip(ceed,ierr); 586*0d0321e0SJeremy L Thompson ierr = hipMemcpy(data->d_grad1d, grad1d, iBytes, 587*0d0321e0SJeremy L Thompson hipMemcpyHostToDevice); CeedChk_Hip(ceed,ierr); 588*0d0321e0SJeremy L Thompson 589*0d0321e0SJeremy L Thompson // Complie basis kernels 590*0d0321e0SJeremy L Thompson CeedInt ncomp; 591*0d0321e0SJeremy L Thompson ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr); 592*0d0321e0SJeremy L Thompson ierr = CeedCompileHip(ceed, basiskernels, &data->module, 7, 593*0d0321e0SJeremy L Thompson "BASIS_Q1D", Q1d, 594*0d0321e0SJeremy L Thompson "BASIS_P1D", P1d, 595*0d0321e0SJeremy L Thompson "BASIS_BUF_LEN", ncomp * CeedIntPow(Q1d > P1d ? 596*0d0321e0SJeremy L Thompson Q1d : P1d, dim), 597*0d0321e0SJeremy L Thompson "BASIS_DIM", dim, 598*0d0321e0SJeremy L Thompson "BASIS_NCOMP", ncomp, 599*0d0321e0SJeremy L Thompson "BASIS_ELEMSIZE", CeedIntPow(P1d, dim), 600*0d0321e0SJeremy L Thompson "BASIS_NQPT", CeedIntPow(Q1d, dim) 601*0d0321e0SJeremy L Thompson ); CeedChkBackend(ierr); 602*0d0321e0SJeremy L Thompson ierr = CeedGetKernelHip(ceed, data->module, "interp", &data->interp); 603*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 604*0d0321e0SJeremy L Thompson ierr = CeedGetKernelHip(ceed, data->module, "grad", &data->grad); 605*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 606*0d0321e0SJeremy L Thompson ierr = CeedGetKernelHip(ceed, data->module, "weight", &data->weight); 607*0d0321e0SJeremy L Thompson CeedChkBackend(ierr); 608*0d0321e0SJeremy L Thompson ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr); 609*0d0321e0SJeremy L Thompson 610*0d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 611*0d0321e0SJeremy L Thompson CeedBasisApply_Hip); CeedChkBackend(ierr); 612*0d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 613*0d0321e0SJeremy L Thompson CeedBasisDestroy_Hip); CeedChkBackend(ierr); 614*0d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 615*0d0321e0SJeremy L Thompson } 616*0d0321e0SJeremy L Thompson 617*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 618*0d0321e0SJeremy L Thompson // Create non-tensor 619*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 620*0d0321e0SJeremy L Thompson int CeedBasisCreateH1_Hip(CeedElemTopology topo, CeedInt dim, CeedInt nnodes, 621*0d0321e0SJeremy L Thompson CeedInt nqpts, const CeedScalar *interp, 622*0d0321e0SJeremy L Thompson const CeedScalar *grad, const CeedScalar *qref, 623*0d0321e0SJeremy L Thompson const CeedScalar *qweight, CeedBasis basis) { 624*0d0321e0SJeremy L Thompson int ierr; 625*0d0321e0SJeremy L Thompson Ceed ceed; 626*0d0321e0SJeremy L Thompson ierr = CeedBasisGetCeed(basis, &ceed); CeedChkBackend(ierr); 627*0d0321e0SJeremy L Thompson CeedBasisNonTensor_Hip *data; 628*0d0321e0SJeremy L Thompson ierr = CeedCalloc(1, &data); CeedChkBackend(ierr); 629*0d0321e0SJeremy L Thompson 630*0d0321e0SJeremy L Thompson // Copy basis data to GPU 631*0d0321e0SJeremy L Thompson const CeedInt qBytes = nqpts * sizeof(CeedScalar); 632*0d0321e0SJeremy L Thompson ierr = hipMalloc((void **)&data->d_qweight, qBytes); CeedChk_Hip(ceed, ierr); 633*0d0321e0SJeremy L Thompson ierr = hipMemcpy(data->d_qweight, qweight, qBytes, 634*0d0321e0SJeremy L Thompson hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 635*0d0321e0SJeremy L Thompson 636*0d0321e0SJeremy L Thompson const CeedInt iBytes = qBytes * nnodes; 637*0d0321e0SJeremy L Thompson ierr = hipMalloc((void **)&data->d_interp, iBytes); CeedChk_Hip(ceed, ierr); 638*0d0321e0SJeremy L Thompson ierr = hipMemcpy(data->d_interp, interp, iBytes, 639*0d0321e0SJeremy L Thompson hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 640*0d0321e0SJeremy L Thompson 641*0d0321e0SJeremy L Thompson const CeedInt gBytes = qBytes * nnodes * dim; 642*0d0321e0SJeremy L Thompson ierr = hipMalloc((void **)&data->d_grad, gBytes); CeedChk_Hip(ceed, ierr); 643*0d0321e0SJeremy L Thompson ierr = hipMemcpy(data->d_grad, grad, gBytes, 644*0d0321e0SJeremy L Thompson hipMemcpyHostToDevice); CeedChk_Hip(ceed, ierr); 645*0d0321e0SJeremy L Thompson 646*0d0321e0SJeremy L Thompson // Compile basis kernels 647*0d0321e0SJeremy L Thompson CeedInt ncomp; 648*0d0321e0SJeremy L Thompson ierr = CeedBasisGetNumComponents(basis, &ncomp); CeedChkBackend(ierr); 649*0d0321e0SJeremy L Thompson ierr = CeedCompileHip(ceed, kernelsNonTensorRef, &data->module, 4, 650*0d0321e0SJeremy L Thompson "Q", nqpts, 651*0d0321e0SJeremy L Thompson "P", nnodes, 652*0d0321e0SJeremy L Thompson "BASIS_DIM", dim, 653*0d0321e0SJeremy L Thompson "BASIS_NCOMP", ncomp 654*0d0321e0SJeremy L Thompson ); CeedChk_Hip(ceed, ierr); 655*0d0321e0SJeremy L Thompson ierr = CeedGetKernelHip(ceed, data->module, "interp", &data->interp); 656*0d0321e0SJeremy L Thompson CeedChk_Hip(ceed, ierr); 657*0d0321e0SJeremy L Thompson ierr = CeedGetKernelHip(ceed, data->module, "grad", &data->grad); 658*0d0321e0SJeremy L Thompson CeedChk_Hip(ceed, ierr); 659*0d0321e0SJeremy L Thompson ierr = CeedGetKernelHip(ceed, data->module, "weight", &data->weight); 660*0d0321e0SJeremy L Thompson CeedChk_Hip(ceed, ierr); 661*0d0321e0SJeremy L Thompson 662*0d0321e0SJeremy L Thompson ierr = CeedBasisSetData(basis, data); CeedChkBackend(ierr); 663*0d0321e0SJeremy L Thompson 664*0d0321e0SJeremy L Thompson // Register backend functions 665*0d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Apply", 666*0d0321e0SJeremy L Thompson CeedBasisApplyNonTensor_Hip); CeedChkBackend(ierr); 667*0d0321e0SJeremy L Thompson ierr = CeedSetBackendFunction(ceed, "Basis", basis, "Destroy", 668*0d0321e0SJeremy L Thompson CeedBasisDestroyNonTensor_Hip); CeedChkBackend(ierr); 669*0d0321e0SJeremy L Thompson return CEED_ERROR_SUCCESS; 670*0d0321e0SJeremy L Thompson } 671*0d0321e0SJeremy L Thompson //------------------------------------------------------------------------------ 672