// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for MAGMA non-tensor basis interpolation

#include "magma-common-nontensor.h"

////////////////////////////////////////////////////////////////////////////////
// Non-transpose apply with Q_COMP components.
// A is P x Q (staged via the transposed reader), B is P x n, and one Q x n
// slice of C is produced per component d.  Each (blockIdx.x, ty) pair owns one
// NB-column slab of B/C.  B is staged to shared memory once and reused for all
// components, so the per-ty sB slabs occupy the front of shared_data and the
// A staging area (sA) follows them without overlap.
// NOTE(review): there is no explicit barrier between the sB load and the first
// multiply; ordering presumably comes from a barrier inside
// read_A_trans_g2r_1D_nosync (defined in magma-common-nontensor.h, not visible
// here) -- confirm against that header.
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global slab index
  const int nblocks = (n + NB - 1) / NB;             // ceil(n / NB): total number of NB-column slabs
  const int myn     = min(NB, n - id * NB);          // columns in this slab (tail slab may be short)

  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q
  CeedScalar *sB = shared_data + ty * P * NB;          // per-ty slab of B (persists across components)
  CeedScalar *sA = shared_data + blockDim.y * P * NB;  // A staging area, placed after all sB slabs

  // read B once for all C's
  if (id < nblocks) {
    read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  }

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads (unguarded: threads with no slab still help stage A)
    CeedScalar rA[P];
    read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);

    CeedScalar rC[NB];
    mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

    // write C
    if (id < nblocks) {
      write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
    }

    dA += Q * P;  // advance to next component of A
    dC += Q * n;  // advance to next component of C

    __syncthreads();  // all threads done with sA before the next component restages it
  }
}

////////////////////////////////////////////////////////////////////////////////
// Transpose apply with Q_COMP components, overwriting C.
// For each component d, a product of A (P x Q) with the d-th Q x n slice of B
// is accumulated into the register tile rC, which is written to C once at the
// end.  sA and sB alias the same shared memory (sB for ty == 0 starts at sA);
// this is safe because A is moved fully into registers (rA) before B is
// staged, with a __syncthreads() separating the two phases.
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_t(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global slab index
  const int nblocks = (n + NB - 1) / NB;             // ceil(n / NB): total number of NB-column slabs
  const int myn     = min(NB, n - id * NB);          // columns in this slab

  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;  // per-ty slab of B, restaged every component

  CeedScalar rC[NB] = {0.0};  // accumulator over components

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads
    CeedScalar rA[Q];
    read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
    __syncthreads();  // A now lives in rA; the aliased shared memory may be reused for B

    // read B
    if (id < nblocks) {
      read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
    }
    __syncthreads();  // sB fully written before the multiply reads it

    addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

    dA += P * Q;  // advance to next component of A
    dB += Q * n;  // advance to next component of B

    __syncthreads();  // all threads done with sB before the next component restages A/B
  }

  // write C
  if (id < nblocks) {
    write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
  }
}

////////////////////////////////////////////////////////////////////////////////
// Transpose apply-add with Q_COMP components: identical to
// magma_basis_nontensor_device_t except the final write uses sum_C_r2g, i.e.
// the result is ADDED to the existing contents of dC instead of overwriting.
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_ta(const int n, const CeedScalar *dA, const CeedScalar *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global slab index
  const int nblocks = (n + NB - 1) / NB;             // ceil(n / NB): total number of NB-column slabs
  const int myn     = min(NB, n - id * NB);          // columns in this slab

  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;  // per-ty slab of B, restaged every component

  CeedScalar rC[NB] = {0.0};  // accumulator over components

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads
    CeedScalar rA[Q];
    read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
    __syncthreads();  // A now lives in rA; the aliased shared memory may be reused for B

    // read B
    if (id < nblocks) {
      read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
    }
    __syncthreads();  // sB fully written before the multiply reads it

    addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

    dA += P * Q;  // advance to next component of A
    dB += Q * n;  // advance to next component of B

    __syncthreads();  // all threads done with sB before the next component restages A/B
  }

  // sum into C
  if (id < nblocks) {
    sum_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
  }
}

////////////////////////////////////////////////////////////////////////////////
// Single-component non-transpose apply (fast path used when Q_COMP == 1).
// A is staged and read into registers exactly once; threads whose slab index
// falls past the end of the data exit right after the block-wide barrier.
// NOTE(review): the __syncthreads() after the early return is reached only by
// surviving threads; this is safe only if exited threads do not participate in
// the barrier -- confirm for the target architectures.
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global slab index
  const int nblocks = (n + NB - 1) / NB;             // ceil(n / NB): total number of NB-column slabs
  const int myn     = min(NB, n - id * NB);          // columns in this slab

  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * P * NB;  // aliases sA; safe: A is in registers before B is staged

  // read A using all threads
  CeedScalar rA[P];
  read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();

  // terminate threads with no work
  if (id >= nblocks) return;

  // read B
  read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  __syncthreads();

  CeedScalar rC[NB];
  mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

  // write C
  write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
}

////////////////////////////////////////////////////////////////////////////////
// Single-component transpose apply (fast path used when Q_COMP == 1),
// overwriting C.  Same staging/aliasing/early-exit pattern as
// magma_basis_nontensor_device_n1, with the non-transposed A reader.
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_t1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global slab index
  const int nblocks = (n + NB - 1) / NB;             // ceil(n / NB): total number of NB-column slabs
  const int myn     = min(NB, n - id * NB);          // columns in this slab

  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;  // aliases sA; safe: A is in registers before B is staged

  // read A using all threads
  CeedScalar rA[Q];
  read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();

  // terminate threads with no work
  if (id >= nblocks) return;

  // read B
  read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
  __syncthreads();

  CeedScalar rC[NB];
  mul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

  // write C
  write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
}

////////////////////////////////////////////////////////////////////////////////
// Single-component transpose apply-add: identical to
// magma_basis_nontensor_device_t1 except the result is summed into the
// existing contents of dC (sum_C_r2g) rather than overwriting it.
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_ta1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                   CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global slab index
  const int nblocks = (n + NB - 1) / NB;             // ceil(n / NB): total number of NB-column slabs
  const int myn     = min(NB, n - id * NB);          // columns in this slab

  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;  // aliases sA; safe: A is in registers before B is staged

  // read A using all threads
  CeedScalar rA[Q];
  read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();

  // terminate threads with no work
  if (id >= nblocks) return;

  // read B
  read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
  __syncthreads();

  CeedScalar rC[NB];
  mul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

  // sum into C
  sum_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
}

////////////////////////////////////////////////////////////////////////////////
// Interpolation, non-transpose: dispatches at compile time to the
// single-component fast path when BASIS_Q_COMP_INTERP == 1, otherwise to the
// multi-component loop.  Dynamic shared memory is sized by the host launcher
// (not visible here) via MAGMA_DEVICE_SHARED.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
    void magma_interp_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_INTERP == 1
  magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}

////////////////////////////////////////////////////////////////////////////////
// Interpolation, transpose (overwrites dC): same compile-time dispatch on
// BASIS_Q_COMP_INTERP as the non-transpose kernel.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_interp_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_INTERP == 1
  magma_basis_nontensor_device_t1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}

////////////////////////////////////////////////////////////////////////////////
// Interpolation, transpose apply-add (sums into dC); reuses the transpose
// blocking size BASIS_NB_INTERP_T.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_interp_nontensor_ta(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_INTERP == 1
  magma_basis_nontensor_device_ta1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_ta<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}

////////////////////////////////////////////////////////////////////////////////
// Derivative (basis gradient) action, non-transpose: dispatches on
// BASIS_Q_COMP_DERIV, using the derivative blocking size BASIS_NB_DERIV_N.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
    void magma_deriv_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_DERIV == 1
  magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}

////////////////////////////////////////////////////////////////////////////////
// Derivative (basis gradient) action, transpose (overwrites dC): dispatches on
// BASIS_Q_COMP_DERIV, using the transpose blocking size BASIS_NB_DERIV_T.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_deriv_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_DERIV == 1
  magma_basis_nontensor_device_t1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}

////////////////////////////////////////////////////////////////////////////////
// Derivative (basis gradient) action, transpose apply-add: the per-component
// products are SUMMED into the existing contents of dC (sum_C path) instead of
// overwriting it.  Dispatches at compile time on the number of derivative
// components, reusing the transpose blocking size BASIS_NB_DERIV_T.  Dynamic
// shared memory is sized by the host launcher (not visible here) via
// MAGMA_DEVICE_SHARED.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_deriv_nontensor_ta(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_DERIV == 1
  // single component: fast path that stages A only once
  magma_basis_nontensor_device_ta1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  // multiple components: loop over components, accumulating in registers
  magma_basis_nontensor_device_ta<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}