// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

#include <ceed.h>

//------------------------------------------------------------------------------
// Non-Tensor Basis Kernels
//
// Launch layout (per the per-branch comments below): blockDim.x holds one
// thread per basis node (P nodes in transpose mode, Q quadrature points
// otherwise); blockDim.z may pack several elements into one block, and the
// element loop grid-strides by gridDim.x * blockDim.z.
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// Interp
//
// Applies the interpolation matrix d_B (P x Q values) to every component of
// every element. transpose != 0 contracts quadrature values back to nodes.
//------------------------------------------------------------------------------
extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *d_B, const CeedScalar *__restrict__ d_U,
                                  CeedScalar *__restrict__ d_V) {
  const CeedInt node = threadIdx.x;
  // Stride between successive components of one field (element-major layout)
  const CeedInt p_comp_stride = num_elem * BASIS_P;
  const CeedInt q_comp_stride = num_elem * BASIS_Q;
  const CeedInt elem_stride   = gridDim.x * blockDim.z;
  // TODO load B in shared memory if blockDim.z > 1?

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += elem_stride) {
    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
      if (transpose) {  // run with P threads
        const CeedScalar *u   = d_U + elem * BASIS_Q + comp * q_comp_stride;
        CeedScalar        sum = 0.0;

        for (CeedInt i = 0; i < BASIS_Q; i++) sum += d_B[node + i * BASIS_P] * u[i];

        d_V[elem * BASIS_P + comp * p_comp_stride + node] = sum;
      } else {  // run with Q threads
        const CeedScalar *u   = d_U + elem * BASIS_P + comp * p_comp_stride;
        CeedScalar        sum = 0.0;

        for (CeedInt i = 0; i < BASIS_P; i++) sum += d_B[i + node * BASIS_P] * u[i];

        d_V[elem * BASIS_Q + comp * q_comp_stride + node] = sum;
      }
    }
  }
}

//------------------------------------------------------------------------------
// Grad
//
// Applies the gradient matrix d_G (P x Q values per dimension) to every
// component of every element. transpose != 0 sums the contributions of all
// BASIS_DIM dimension slices into a single nodal value; otherwise each
// dimension produces its own output slice.
//------------------------------------------------------------------------------
extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *d_G, const CeedScalar *__restrict__ d_U,
                                CeedScalar *__restrict__ d_V) {
  const CeedInt node = threadIdx.x;
  // Component and dimension strides for the element-major field layout
  const CeedInt p_comp_stride = num_elem * BASIS_P;
  const CeedInt q_comp_stride = num_elem * BASIS_Q;
  const CeedInt q_dim_stride  = BASIS_NUM_COMP * num_elem * BASIS_Q;
  const CeedInt elem_stride   = gridDim.x * blockDim.z;
  // TODO load G in shared memory if blockDim.z > 1?

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += elem_stride) {
    for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) {
      if (transpose) {  // run with P threads
        CeedScalar sum = 0.0;

        // Accumulate every dimension slice into one nodal value
        for (CeedInt dim = 0; dim < BASIS_DIM; dim++) {
          const CeedScalar *u = d_U + elem * BASIS_Q + comp * q_comp_stride + dim * q_dim_stride;

          for (CeedInt i = 0; i < BASIS_Q; i++) sum += d_G[node + i * BASIS_P + dim * BASIS_P * BASIS_Q] * u[i];
        }

        d_V[elem * BASIS_P + comp * p_comp_stride + node] = sum;
      } else {  // run with Q threads
        const CeedScalar *u = d_U + elem * BASIS_P + comp * p_comp_stride;
        CeedScalar        grad[BASIS_DIM] = {0.0};

        // Read each nodal value once and scatter it into all dimensions
        for (CeedInt i = 0; i < BASIS_P; i++) {
          const CeedScalar u_i = u[i];

          for (CeedInt dim = 0; dim < BASIS_DIM; dim++) grad[dim] += d_G[i + node * BASIS_P + dim * BASIS_P * BASIS_Q] * u_i;
        }

        for (CeedInt dim = 0; dim < BASIS_DIM; dim++) {
          d_V[elem * BASIS_Q + comp * q_comp_stride + dim * q_dim_stride + node] = grad[dim];
        }
      }
    }
  }
}

//------------------------------------------------------------------------------
// Weight
//
// Broadcasts the reference quadrature weights into every element of d_V.
// Run with Q threads (one per quadrature point).
//------------------------------------------------------------------------------
extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_V) {
  const CeedInt node        = threadIdx.x;
  const CeedInt elem_stride = gridDim.x * blockDim.z;
  // TODO load q_weight in shared memory if blockDim.z > 1?

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += elem_stride) {
    d_V[elem * BASIS_Q + node] = q_weight[node];
  }
}

//------------------------------------------------------------------------------