1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 /// @file 9 /// Internal header for HIP non-tensor product basis 10 #ifndef CEED_HIP_REF_BASIS_NONTENSOR_H 11 #define CEED_HIP_REF_BASIS_NONTENSOR_H 12 13 #include <ceed.h> 14 15 //------------------------------------------------------------------------------ 16 // Non-Tensor Basis Kernels 17 //------------------------------------------------------------------------------ 18 19 //------------------------------------------------------------------------------ 20 // Interp 21 //------------------------------------------------------------------------------ 22 extern "C" __global__ void Interp(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *d_B, const CeedScalar *__restrict__ d_U, 23 CeedScalar *__restrict__ d_V) { 24 const CeedInt t_id = threadIdx.x; 25 26 const CeedScalar *U; 27 CeedScalar V; 28 // TODO load B in shared memory if blockDim.z > 1? 29 30 for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 31 for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 32 if (transpose) { // run with P threads 33 U = d_U + elem * BASIS_Q + comp * num_elem * BASIS_Q; 34 V = 0.0; 35 for (CeedInt i = 0; i < BASIS_Q; i++) V += d_B[t_id + i * BASIS_P] * U[i]; 36 37 d_V[elem * BASIS_P + comp * num_elem * BASIS_P + t_id] = V; 38 } else { // run with Q threads 39 U = d_U + elem * BASIS_P + comp * num_elem * BASIS_P; 40 V = 0.0; 41 for (CeedInt i = 0; i < BASIS_P; i++) V += d_B[i + t_id * BASIS_P] * U[i]; 42 43 d_V[elem * BASIS_Q + comp * num_elem * BASIS_Q + t_id] = V; 44 } 45 } 46 } 47 } 48 49 //------------------------------------------------------------------------------ 50 // Grad 51 //------------------------------------------------------------------------------ 52 extern "C" __global__ void Grad(const CeedInt num_elem, const CeedInt transpose, const CeedScalar *d_G, const CeedScalar *__restrict__ d_U, 53 CeedScalar *__restrict__ d_V) { 54 const CeedInt t_id = threadIdx.x; 55 56 const CeedScalar *U; 57 // TODO load G in shared memory if blockDim.z > 1? 58 59 for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 60 for (CeedInt comp = 0; comp < BASIS_NUM_COMP; comp++) { 61 if (transpose) { // run with P threads 62 CeedScalar V = 0.0; 63 for (CeedInt dim = 0; dim < BASIS_DIM; dim++) { 64 U = d_U + elem * BASIS_Q + comp * num_elem * BASIS_Q + dim * BASIS_NUM_COMP * num_elem * BASIS_Q; 65 for (CeedInt i = 0; i < BASIS_Q; i++) V += d_G[t_id + i * BASIS_P + dim * BASIS_P * BASIS_Q] * U[i]; 66 } 67 68 d_V[elem * BASIS_P + comp * num_elem * BASIS_P + t_id] = V; 69 } else { // run with Q threads 70 CeedScalar V[BASIS_DIM]; 71 U = d_U + elem * BASIS_P + comp * num_elem * BASIS_P; 72 for (CeedInt dim = 0; dim < BASIS_DIM; dim++) V[dim] = 0.0; 73 for (CeedInt i = 0; i < BASIS_P; i++) { 74 const CeedScalar val = U[i]; 75 for (CeedInt dim = 0; dim < BASIS_DIM; dim++) V[dim] += d_G[i + t_id * BASIS_P + dim * BASIS_P * BASIS_Q] * val; 76 } 77 78 for (CeedInt dim = 0; dim < BASIS_DIM; dim++) { 79 d_V[elem * BASIS_Q + comp * num_elem * BASIS_Q + dim * BASIS_NUM_COMP * num_elem * BASIS_Q + t_id] = V[dim]; 80 } 81 } 82 } 83 } 84 } 85 86 //------------------------------------------------------------------------------ 87 // Weight 88 //------------------------------------------------------------------------------ 89 extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight, CeedScalar *__restrict__ d_V) { 90 const CeedInt t_id = threadIdx.x; 91 // TODO load q_weight in shared memory if blockDim.z > 1? 92 for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { 93 d_V[elem * BASIS_Q + t_id] = q_weight[t_id]; 94 } 95 } 96 97 //------------------------------------------------------------------------------ 98 99 #endif // CEED_HIP_REF_BASIS_NONTENSOR_H 100