// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for MAGMA non-tensor basis interpolation
#ifndef CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H
#define CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H

#include "magma-common-nontensor.h"

////////////////////////////////////////////////////////////////////////////////
// Non-transpose ("notrans") apply for a multi-component basis (Q_COMP > 1).
//
// Each thread-block column (ty) owns one NB-wide slice of the n columns of B
// and, for every component d = 0, ..., Q_COMP-1, produces the matching
// Q x NB slice of C. A is stored P x Q per component (see comment below) and
// is read transposed into registers, so effectively:
//
//   C_d (Q x NB) = A_d^T (Q x P) * B (P x NB)
//
// B is staged into shared memory once and reused for all components.
//
// Shared-memory layout (dynamic, provided by the launching kernel):
//   [0, blockDim.y * P * NB)                   : per-ty staging of B (sB)
//   [blockDim.y * P * NB, ... + P * Q)         : staging of A, shared by all ty (sA)
//
// The *_nosync helpers leave synchronization to this routine: the single
// barrier at the bottom of the d-loop protects sA before the next component's
// A overwrites it. NOTE(review): the helpers' internal access patterns are
// defined in magma-common-nontensor.h — any claim about finer-grained
// ordering should be confirmed there.
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global index of this NB-wide slice
  const int nblocks = (n + NB - 1) / NB;             // ceil(n / NB): total number of slices
  const int myn     = min(NB, n - id * NB);          // columns in this (possibly partial, tail) slice

  // advance to this slice's columns of B and C
  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q
  CeedScalar *sB = shared_data + ty * P * NB;
  CeedScalar *sA = shared_data + blockDim.y * P * NB;

  // read B once for all C's
  if (id < nblocks) {
    read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  }

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // init rA, rC
    CeedScalar rA[P], rC[NB];

    // read A using all threads
    read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);

    // slices past the tail of n have no output, but their threads must still
    // reach the barrier below, so they only skip the compute/store
    if (id < nblocks) {
      mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

      // write C
      write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
    }

    // next component: A advances by one P x Q matrix, C by one full Q x n output
    dA += Q * P;
    dC += Q * n;

    __syncthreads();
  }
}

////////////////////////////////////////////////////////////////////////////////
// Non-transpose apply specialized for a single-component basis (Q_COMP == 1):
// effectively C (Q x NB) = A^T (Q x P) * B (P x NB) for each NB-wide slice.
//
// With only one component there is no d-loop, so A is staged once and idle
// threads can exit early. Note that sA and sB alias (sB for ty == 0 starts at
// shared_data, same as sA); this is safe because A is fully copied into the
// rA registers — with a barrier — before B overwrites the staging area.
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global index of this NB-wide slice
  const int nblocks = (n + NB - 1) / NB;             // ceil(n / NB): total number of slices
  const int myn     = min(NB, n - id * NB);          // columns in this (possibly partial) slice

  // advance to this slice's columns of B and C
  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * P * NB;

  // init rA, rC
  CeedScalar rA[P], rC[NB];

  // read A using all threads
  read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();

  // terminate threads with no work
  // NOTE(review): the barrier after read_B below is only reached by surviving
  // threads; this relies on exited threads not participating in __syncthreads()
  if (id >= nblocks) return;

  // read B once for all C's
  read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  __syncthreads();

  mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

  // write C
  write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
}

////////////////////////////////////////////////////////////////////////////////
// Transpose ("trans") apply: each NB-wide slice of columns accumulates over
// all Q_COMP components into a single output slice,
//
//   C (P x NB) = sum_d A_d (P x Q) * B_d (Q x NB),   d = 0, ..., Q_COMP - 1
//
// (A is read non-transposed here; B advances by a full Q x n block per
// component while the rC accumulator stays live in registers.)
//
// Shared memory holds only the per-ty staging of B (blockDim.y * Q * NB
// scalars). Idle slices exit before the first barrier, so all remaining
// barriers are reached by every surviving thread.
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_t(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global index of this NB-wide slice
  const int nblocks = (n + NB - 1) / NB;             // ceil(n / NB): total number of slices
  const int myn     = min(NB, n - id * NB);          // columns in this (possibly partial) slice

  // terminate threads with no work
  if (id >= nblocks) return;

  // advance to this slice's columns of B and C
  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  CeedScalar *sB = shared_data + ty * Q * NB;

  // init rC (accumulates across all components)
  CeedScalar rC[NB] = {0.0};

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // init rA
    CeedScalar rA[Q];

    // read A
    read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, NB>(tx, dA, rA);

    // read B
    read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
    __syncthreads();

    // rC += rA * sB
    addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

    // next component: A advances by one P x Q matrix, B by one full Q x n input
    dA += P * Q;
    dB += Q * n;

    // protect sB before the next component's B overwrites it
    __syncthreads();
  }

  // write C
  write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
}

////////////////////////////////////////////////////////////////////////////////
// Kernel entry points. The BASIS_* macros (BASIS_P, BASIS_Q, BASIS_Q_COMP_*,
// BASIS_NB_*) are injected at JIT-compile time, so the Q_COMP branches below
// resolve to a single call at compile time. Launch configuration and the
// dynamic shared-memory size are chosen by the host-side MAGMA basis code.

//////////////////////////////////////////////////////////////////////////////// 
// Interpolation, non-transpose: dA is the interp matrix, dB input, dC output.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
    void magma_interp_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

  if (BASIS_Q_COMP_INTERP == 1) {
    magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
  } else {
    magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
  }
}

////////////////////////////////////////////////////////////////////////////////
// Interpolation, transpose apply.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_interp_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
}

////////////////////////////////////////////////////////////////////////////////
// Derivative (gradient), non-transpose apply.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
    void magma_deriv_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

  if (BASIS_Q_COMP_DERIV == 1) {
    magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
  } else {
    magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
  }
}

////////////////////////////////////////////////////////////////////////////////
// Derivative (gradient), transpose apply.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_deriv_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
}

#endif  // CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H