// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for MAGMA non-tensor basis interpolation
#include "magma-common-nontensor.h"

////////////////////////////////////////////////////////////////////////////////
// "No-transpose" apply over Q_COMP components.
//
// Thread layout (from the index math below): a 2-D thread block of
// blockDim.x x blockDim.y; each (blockIdx.x, ty) pair owns one slab of NB
// columns of B/C, with myn covering the ragged final slab.  B is staged in
// shared memory once and reused for every component; for each component d the
// P x Q matrix A_d is loaded (transposed) into registers and one Q x NB slab
// of C is produced (dA advances by Q*P and dC by Q*n per component).
//
// Shared-memory layout: blockDim.y tiles of P*NB scalars for B (one per ty),
// followed by a region sA handed to the transposed A read.
// NOTE(review): the *_nosync helpers leave synchronization to this caller;
// barrier placement follows the helper contracts in magma-common-nontensor.h —
// do not reorder statements without consulting that header.
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global slab index (uniform across a ty row)
  const int nblocks = (n + NB - 1) / NB;             // total number of NB-column slabs
  const int myn     = min(NB, n - id * NB);          // columns in this slab (< NB only at the tail)

  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q
  CeedScalar *sB = shared_data + ty * P * NB;
  CeedScalar *sA = shared_data + blockDim.y * P * NB;

  // read B once for all C's
  if (id < nblocks) {
    read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  }

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads
    CeedScalar rA[P];
    read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);

    CeedScalar rC[NB];
    mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

    // write C
    if (id < nblocks) {
      write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
    }

    // advance to component d+1: next P x Q matrix of A, next Q x n block of C
    dA += Q * P;
    dC += Q * n;

    __syncthreads();
  }
}

////////////////////////////////////////////////////////////////////////////////
// "Transpose" apply over Q_COMP components.
//
// Each (blockIdx.x, ty) pair owns one NB-column slab.  rC (NB scalars per
// thread) accumulates across all components: per iteration a fresh Q x NB tile
// of B is staged in shared memory (dB advances by Q*n) and addmul folds it in;
// C is written once at the end, overwriting the destination slab.
//
// Shared-memory reuse: sA points at the start of the same region that holds
// the per-ty sB tiles; the barrier after the A read ensures A is already in
// registers before B overwrites the staging area.  NOTE(review): aliasing of
// sA/sB appears intentional given the barrier placement — confirm the staging
// footprint of read_A_notrans_g2r_1D_nosync in magma-common-nontensor.h.
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void
magma_basis_nontensor_device_t(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                               CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global slab index
  const int nblocks = (n + NB - 1) / NB;             // total number of NB-column slabs
  const int myn     = min(NB, n - id * NB);          // columns in this slab

  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  CeedScalar rC[NB] = {0.0};

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads
    CeedScalar rA[Q];
    read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
    __syncthreads();

    // read B
    if (id < nblocks) {
      read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
    }
    __syncthreads();

    // rC += op(A_d) * B_d, accumulated across components
    addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

    // advance to component d+1: next P x Q matrix of A, next Q x n block of B
    dA += P * Q;
    dB += Q * n;

    __syncthreads();
  }

  // write C
  if (id < nblocks) {
    write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
  }
}

////////////////////////////////////////////////////////////////////////////////
// "Transpose-and-add" apply over Q_COMP components.
//
// Identical structure to magma_basis_nontensor_device_t above — same slab
// decomposition, same per-component accumulation into rC — except the final
// store uses sum_C_r2g_1D_nosync, i.e. the result is summed into the existing
// contents of C instead of overwriting them.
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_ta(const int n, const CeedScalar *dA, const CeedScalar *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global slab index
  const int nblocks = (n + NB - 1) / NB;             // total number of NB-column slabs
  const int myn     = min(NB, n - id * NB);          // columns in this slab

  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  CeedScalar rC[NB] = {0.0};

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads
    CeedScalar rA[Q];
    read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
    __syncthreads();

    // read B
    if (id < nblocks) {
      read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
    }
    __syncthreads();

    // rC += op(A_d) * B_d, accumulated across components
    addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

    // advance to component d+1
    dA += P * Q;
    dB += Q * n;

    __syncthreads();
  }

  // sum into C
  if (id < nblocks) {
    sum_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
  }
}

////////////////////////////////////////////////////////////////////////////////
// Single-component (Q_COMP == 1) fast path of the "no-transpose" apply:
// A is loaded (transposed) into registers exactly once, then each slab stages
// its B tile and produces its C tile with a single multiply.
//
// Shared-memory reuse: sA and the sB tiles start at the same address; the
// barrier after the A read guarantees A is consumed before B is staged.
// NOTE(review): the early return below exits whole ty rows (id does not depend
// on tx) before the later barriers; only the remaining working threads reach
// __syncthreads — a pattern that relies on barriers not counting exited threads.
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global slab index (uniform across a ty row)
  const int nblocks = (n + NB - 1) / NB;             // total number of NB-column slabs
  const int myn     = min(NB, n - id * NB);          // columns in this slab

  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * P * NB;

  // read A using all threads
  CeedScalar rA[P];
  read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();

  // terminate threads with no work
  if (id >= nblocks) return;

  // read B
  read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  __syncthreads();

  CeedScalar rC[NB];
  mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

  // write C
  write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
}

////////////////////////////////////////////////////////////////////////////////
// Single-component (Q_COMP == 1) fast path of the "transpose" apply: A is
// loaded once into registers, each slab stages its Q x NB tile of B, performs
// one multiply, and overwrites its P x NB slab of C.  Same shared-memory reuse
// and early-exit pattern as magma_basis_nontensor_device_n1 above.
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_t1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global slab index
  const int nblocks = (n + NB - 1) / NB;             // total number of NB-column slabs
  const int myn     = min(NB, n - id * NB);          // columns in this slab

  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  // read A using all threads
  CeedScalar rA[Q];
  read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();

  // terminate threads with no work
  if (id >= nblocks) return;

  // read B
  read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
  __syncthreads();

  CeedScalar rC[NB];
  mul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

  // write C
  write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
}

////////////////////////////////////////////////////////////////////////////////
// Single-component (Q_COMP == 1) fast path of the "transpose-and-add" apply:
// identical to magma_basis_nontensor_device_t1 except the result is summed
// into the existing contents of C (sum_C_r2g_1D_nosync) instead of overwriting.
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_ta1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                   CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global slab index
  const int nblocks = (n + NB - 1) / NB;             // total number of NB-column slabs
  const int myn     = min(NB, n - id * NB);          // columns in this slab

  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  // read A using all threads
  CeedScalar rA[Q];
  read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();

  // terminate threads with no work
  if (id >= nblocks) return;

  // read B
  read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
  __syncthreads();

  CeedScalar rC[NB];
  mul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

  // sum into C
  sum_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
}

////////////////////////////////////////////////////////////////////////////////
// Kernel entry: interpolation, "no-transpose" direction.  Dynamic shared
// memory is provided via MAGMA_DEVICE_SHARED; dispatches at compile time to
// the single-component fast path when BASIS_Q_COMP_INTERP == 1.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
void magma_interp_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_INTERP == 1
  magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}

////////////////////////////////////////////////////////////////////////////////
// Kernel entry: interpolation, "transpose" direction (C is overwritten).
// Dispatches to the single-component fast path when BASIS_Q_COMP_INTERP == 1.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
void magma_interp_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_INTERP == 1
  magma_basis_nontensor_device_t1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}

////////////////////////////////////////////////////////////////////////////////
// Kernel entry: interpolation, "transpose" direction, summing into C.
// Reuses the transpose blocking factor BASIS_NB_INTERP_T.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
void magma_interp_nontensor_ta(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_INTERP == 1
  magma_basis_nontensor_device_ta1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_ta<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}

////////////////////////////////////////////////////////////////////////////////
// Kernel entry: derivative, "no-transpose" direction.  Dispatches to the
// single-component fast path when BASIS_Q_COMP_DERIV == 1.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
void magma_deriv_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_DERIV == 1
  magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}

////////////////////////////////////////////////////////////////////////////////
// Kernel entry: derivative, "transpose" direction (C is overwritten).
// Dispatches to the single-component fast path when BASIS_Q_COMP_DERIV == 1.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
void magma_deriv_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_DERIV == 1
  magma_basis_nontensor_device_t1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}

////////////////////////////////////////////////////////////////////////////////
// Kernel entry: derivative, "transpose" direction, summing into C.
// Dynamic shared memory is provided via MAGMA_DEVICE_SHARED; dispatches at
// compile time to the single-component fast path when BASIS_Q_COMP_DERIV == 1,
// and reuses the transpose blocking factor BASIS_NB_DERIV_T in both branches.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
void magma_deriv_nontensor_ta(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_DERIV == 1
  magma_basis_nontensor_device_ta1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_ta<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}