// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for MAGMA non-tensor basis interpolation
#ifndef CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H
#define CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H

#include "magma-common-nontensor.h"

////////////////////////////////////////////////////////////////////////////////
// Non-transpose apply for Q_COMP components.
// For each component d, computes a Q x NB panel of C from the Q x P matrix A_d
// (read transposed from a P x Q storage) and a P x NB panel of B.
// B is staged into shared memory once and reused for every component; A advances
// by Q*P and the C output pointer by Q*n per component.
//
// Launch layout (as used by the block/thread indexing below): threadIdx.x spans
// the matrix row dimension, threadIdx.y selects which NB-column slice of B/C this
// group of threads owns, and blockIdx.x * blockDim.y + threadIdx.y is the global
// slice id.  Shared memory: blockDim.y slices of P*NB scalars for B, followed by
// one common staging area for A.
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global NB-column slice handled by this ty group
  const int nblocks = (n + NB - 1) / NB;             // ceil(n / NB) slices in total
  const int myn     = min(NB, n - id * NB);          // columns actually owned (tail slice may be short)

  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q
  CeedScalar *sB = shared_data + ty * P * NB;          // per-ty staging for this slice's P x NB panel of B
  CeedScalar *sA = shared_data + blockDim.y * P * NB;  // single A staging area shared by the whole thread block

  // read B once for all C's
  if (id < nblocks) {
    read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  }

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads
    CeedScalar rA[P];
    read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);

    CeedScalar rC[NB];
    mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

    // write C
    if (id < nblocks) {
      write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
    }

    dA += Q * P;  // next component's P x Q matrix
    dC += Q * n;  // next component's Q x n output block

    // NOTE(review): the "_nosync" helpers leave synchronization to the caller; this
    // barrier guards the reuse of the shared staging areas in the next iteration.
    __syncthreads();
  }
}

////////////////////////////////////////////////////////////////////////////////
// Transpose apply for Q_COMP components, accumulating over components.
// For each component d, a P x NB panel of C accumulates the product of A_d
// (P x Q, read non-transposed) with a Q x NB panel of B_d; A advances by P*Q
// and B by Q*n per component.  The result is written once after the loop.
//
// NOTE(review): sB for ty == 0 starts at shared_data and therefore aliases sA;
// the barrier after read_A appears to be what makes overwriting that region
// with B safe (A already staged into the rA registers) — confirm against the
// helper implementations in magma-common-nontensor.h.
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_t(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global NB-column slice handled by this ty group
  const int nblocks = (n + NB - 1) / NB;             // ceil(n / NB) slices in total
  const int myn     = min(NB, n - id * NB);          // columns actually owned (tail slice may be short)

  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;  // per-ty staging for this slice's Q x NB panel of B

  // per-thread accumulator for one row of the P x NB output panel
  CeedScalar rC[NB] = {0.0};

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads
    CeedScalar rA[Q];
    read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
    __syncthreads();

    // read B
    if (id < nblocks) {
      read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
    }
    __syncthreads();

    addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

    dA += P * Q;  // next component's P x Q matrix
    dB += Q * n;  // next component's Q x n input block

    __syncthreads();
  }

  // write C
  if (id < nblocks) {
    write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
  }
}

////////////////////////////////////////////////////////////////////////////////
// Non-transpose apply specialized for a single component (Q_COMP == 1):
// A is staged once, threads with no column slice exit early, and there is no
// accumulation loop.  Same layout conventions as magma_basis_nontensor_device_n.
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global NB-column slice handled by this ty group
  const int nblocks = (n + NB - 1) / NB;             // ceil(n / NB) slices in total
  const int myn     = min(NB, n - id * NB);          // columns actually owned (tail slice may be short)

  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * P * NB;  // per-ty staging for this slice's P x NB panel of B

  // read A using all threads (all threads reach this barrier before any exit below)
  CeedScalar rA[P];
  read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();

  // terminate threads with no work
  if (id >= nblocks) return;

  // read B
  // NOTE(review): the barrier below is only reached by threads that did not exit
  // above; this relies on __syncthreads() waiting only on non-exited threads —
  // confirm against the CUDA/PTX barrier semantics for the targeted architectures.
  read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  __syncthreads();

  CeedScalar rC[NB];
  mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

  // write C
  write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
}

////////////////////////////////////////////////////////////////////////////////
// Transpose apply specialized for a single component (Q_COMP == 1):
// no accumulation loop, single A staging, early exit for idle threads.
// Same layout conventions as magma_basis_nontensor_device_t.
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_t1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global NB-column slice handled by this ty group
  const int nblocks = (n + NB - 1) / NB;             // ceil(n / NB) slices in total
  const int myn     = min(NB, n - id * NB);          // columns actually owned (tail slice may be short)

  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;  // per-ty staging for this slice's Q x NB panel of B

  // read A using all threads (all threads reach this barrier before any exit below)
  CeedScalar rA[Q];
  read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();

  // terminate threads with no work
  if (id >= nblocks) return;

  // read B (barrier below reached only by surviving threads; see note in the n1 variant)
  read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
  __syncthreads();

  CeedScalar rC[NB];
  mul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

  // write C
  write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
}

////////////////////////////////////////////////////////////////////////////////
// Kernel entry: interpolation, non-transpose apply.
// Dynamic shared memory is provided through MAGMA_DEVICE_SHARED; the single- vs
// multi-component device path is selected at compile time on BASIS_Q_COMP_INTERP.
// Launch bounds are derived from BASIS_Q (the output row dimension of this path).
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
void magma_interp_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_INTERP == 1
  magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}

////////////////////////////////////////////////////////////////////////////////
// Kernel entry: interpolation, transpose apply.
// Same compile-time dispatch as above; launch bounds derived from BASIS_P (the
// output row dimension of the transpose path).
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
void magma_interp_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_INTERP == 1
  magma_basis_nontensor_device_t1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}

////////////////////////////////////////////////////////////////////////////////
// Kernel entry: derivative, non-transpose apply.
// Mirrors magma_interp_nontensor_n with the derivative component count
// (BASIS_Q_COMP_DERIV) and blocking factor (BASIS_NB_DERIV_N).
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
void magma_deriv_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_DERIV == 1
  magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}

////////////////////////////////////////////////////////////////////////////////
// Kernel entry: derivative, transpose apply.
// Mirrors magma_interp_nontensor_t with the derivative component count
// (BASIS_Q_COMP_DERIV) and blocking factor (BASIS_NB_DERIV_T).
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
void magma_deriv_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_DERIV == 1
  magma_basis_nontensor_device_t1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}

#endif  // CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H