// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for HIP non-tensor product basis templates
#ifndef CEED_HIP_REF_BASIS_NONTENSOR_TEMPLATES_H
#define CEED_HIP_REF_BASIS_NONTENSOR_TEMPLATES_H

#include <ceed.h>

//------------------------------------------------------------------------------
// Tensor contraction
//
// Applies the basis matrix d_B (P x Q per Q-component) to one element of d_U:
// for each component and each Q-component d, thread t accumulates
//   V[t] = sum_i B[i + t*P + d*P*Q] * U[i]
// Expected launch: Q threads in x per element (one thread per quadrature
// point); `elem` selects the element handled by this thread block slice.
//------------------------------------------------------------------------------
template <int NUM_COMP, int Q_COMP, int P, int Q>
inline __device__ void Contract(const CeedInt elem, const CeedInt strides_elem_U, const CeedInt strides_elem_V, const CeedInt strides_comp_U,
                                const CeedInt strides_comp_V, const CeedInt strides_q_comp_V, const CeedScalar *__restrict__ d_B,
                                const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  const CeedInt t = threadIdx.x;
  // TODO load B in shared memory if blockDim.z > 1?

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Run with Q threads: each thread owns one quadrature point
    const CeedScalar *cur_U = d_U + elem * strides_elem_U + comp * strides_comp_U;
    CeedScalar        acc[Q_COMP];

    for (CeedInt d = 0; d < Q_COMP; d++) acc[d] = 0.0;
    for (CeedInt i = 0; i < P; i++) {
      // Each node value is reused across all Q_COMP accumulators
      const CeedScalar u_i = cur_U[i];

      for (CeedInt d = 0; d < Q_COMP; d++) acc[d] += d_B[i + t * P + d * P * Q] * u_i;
    }
    for (CeedInt d = 0; d < Q_COMP; d++) {
      d_V[elem * strides_elem_V + comp * strides_comp_V + d * strides_q_comp_V + t] = acc[d];
    }
  }
}

//------------------------------------------------------------------------------
// Tensor contraction transpose
//
// Applies the transpose action of d_B to one element of d_U, summing the
// contributions of all Q-components: thread t accumulates
//   V[t] = sum_d sum_i B[t + i*P + d*P*Q] * U_d[i]
// Expected launch: P threads in x per element (one thread per basis node).
//------------------------------------------------------------------------------
template <int NUM_COMP, int Q_COMP, int P, int Q>
inline __device__ void ContractTranspose(const CeedInt elem, const CeedInt strides_elem_U, const CeedInt strides_elem_V, const CeedInt strides_comp_U,
                                         const CeedInt strides_comp_V, const CeedInt strides_q_comp_U, const CeedScalar *__restrict__ d_B,
                                         const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  const CeedInt t = threadIdx.x;
  // TODO load B in shared memory if blockDim.z > 1?

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Run with P threads: each thread owns one basis node
    CeedScalar acc = 0.0;

    for (CeedInt d = 0; d < Q_COMP; d++) {
      const CeedScalar *cur_U = d_U + elem * strides_elem_U + comp * strides_comp_U + d * strides_q_comp_U;

      for (CeedInt i = 0; i < Q; i++) acc += d_B[t + i * P + d * P * Q] * cur_U[i];
    }
    d_V[elem * strides_elem_V + comp * strides_comp_V + t] = acc;
  }
}

//------------------------------------------------------------------------------

#endif  // CEED_HIP_REF_BASIS_NONTENSOR_TEMPLATES_H