1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 #include <ceed.h> 9 10 //------------------------------------------------------------------------------ 11 // Non-Tensor Basis Kernels 12 //------------------------------------------------------------------------------ 13 14 //------------------------------------------------------------------------------ 15 // Interp 16 //------------------------------------------------------------------------------ 17 extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpose, 18 const CeedScalar *d_B, 19 const CeedScalar *__restrict__ d_U, 20 CeedScalar *__restrict__ d_V) { 21 const CeedInt t_id = threadIdx.x; 22 23 const CeedScalar *U; 24 CeedScalar V; 25 //TODO load B in shared memory if blockDim.z > 1? 26 27 for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; 28 elem += gridDim.x*blockDim.z) { 29 for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 30 if (transpose) { // run with P threads 31 U = d_U + elem*BASIS_Q + comp*num_elem*BASIS_Q; 32 V = 0.0; 33 for (CeedInt i = 0; i < BASIS_Q; i++) 34 V += d_B[t_id + i*BASIS_P]*U[i]; 35 36 d_V[elem*BASIS_P + comp*num_elem*BASIS_P + t_id] = V; 37 } else { // run with Q threads 38 U = d_U + elem*BASIS_P + comp*num_elem*BASIS_P; 39 V = 0.0; 40 for (CeedInt i = 0; i < BASIS_P; i++) 41 V += d_B[i + t_id*BASIS_P]*U[i]; 42 43 d_V[elem*BASIS_Q + comp*num_elem*BASIS_Q + t_id] = V; 44 } 45 } 46 } 47 } 48 49 //------------------------------------------------------------------------------ 50 // Grad 51 //------------------------------------------------------------------------------ 52 extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, 53 const CeedScalar *d_G, 54 const CeedScalar *__restrict__ d_U, 55 CeedScalar *__restrict__ d_V) { 56 const CeedInt t_id = threadIdx.x; 57 58 const CeedScalar *U; 59 //TODO load G in shared memory if blockDim.z > 1? 60 61 for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; 62 elem += gridDim.x*blockDim.z) { 63 for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 64 if (transpose) { // run with P threads 65 CeedScalar V = 0.0; 66 for (CeedInt dim = 0; dim < BASIS_DIM; dim++) { 67 U = d_U + elem*BASIS_Q + comp*num_elem*BASIS_Q + 68 dim*BASIS_NUM_COMP*num_elem*BASIS_Q; 69 for (CeedInt i = 0; i < BASIS_Q; i++) 70 V += d_G[t_id + i*BASIS_P + dim*BASIS_P*BASIS_Q]*U[i]; 71 } 72 d_V[elem*BASIS_P + comp*num_elem*BASIS_P + t_id] = V; 73 } else { // run with Q threads 74 CeedScalar V[BASIS_DIM]; 75 U = d_U + elem*BASIS_P + comp*num_elem*BASIS_P; 76 for (CeedInt dim = 0; dim < BASIS_DIM; dim++) 77 V[dim] = 0.0; 78 79 for (CeedInt i = 0; i < BASIS_P; i++) { 80 const CeedScalar val = U[i]; 81 for(CeedInt dim = 0; dim < BASIS_DIM; dim++) 82 V[dim] += d_G[i + t_id*BASIS_P + dim*BASIS_P*BASIS_Q]*val; 83 } 84 for (CeedInt dim = 0; dim < BASIS_DIM; dim++) { 85 d_V[elem*BASIS_Q + comp*num_elem*BASIS_Q + 86 dim*BASIS_NUM_COMP*num_elem*BASIS_Q + t_id] = V[dim]; 87 } 88 } 89 } 90 } 91 } 92 93 //------------------------------------------------------------------------------ 94 // Weight 95 //------------------------------------------------------------------------------ 96 extern "C" __global__ void Weight(const CeedInt num_elem, 97 const CeedScalar *__restrict__ qweight, 98 CeedScalar *__restrict__ d_V) { 99 const CeedInt t_id = threadIdx.x; 100 //TODO load qweight in shared memory if blockDim.z > 1? 101 for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; 102 elem += gridDim.x*blockDim.z) { 103 d_V[elem*BASIS_Q + t_id] = qweight[t_id]; 104 } 105 } 106 107 //------------------------------------------------------------------------------ 108