// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for MAGMA non-tensor basis interpolation

#include "magma-common-nontensor.h"

////////////////////////////////////////////////////////////////////////////////
// Non-tensor "NoTranspose" apply over Q_COMP components: for each component d,
// the d-th P x Q basis matrix A_d is applied to B (P x n) to produce the d-th
// Q x n block of C.
//
// Work decomposition: each (blockIdx.x, threadIdx.y) pair owns one NB-column
// chunk of B/C (id = blockIdx.x * blockDim.y + ty); threadIdx.x indexes rows
// within the chunk. B is staged into shared memory once and reused for all
// Q_COMP components; A is re-staged per component and held in registers (rA).
//
// Shared-memory layout: blockDim.y per-ty B tiles (P * NB scalars each),
// followed by one A staging area common to the whole block.
//
// NOTE(review): template parameter T is unused; the body uses CeedScalar
// directly (the kernels below instantiate with T = CeedScalar).
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global NB-column chunk index
  const int nblocks = (n + NB - 1) / NB;             // total number of chunks
  const int myn     = min(NB, n - id * NB);          // valid columns in this chunk (tail chunk may be short;
                                                     // non-positive for id >= nblocks, masked by the guards below)

  // advance to this chunk's slice of B and C
  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q
  // per-ty B tile first; single A staging area after all blockDim.y B tiles
  CeedScalar *sB = shared_data + ty * P * NB;
  CeedScalar *sA = shared_data + blockDim.y * P * NB;

  // read B once for all C's
  if (id < nblocks) {
    read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  }
  // NOTE(review): no explicit barrier between staging sB above and consuming
  // it below; presumably read_A_trans_g2r_1D_nosync synchronizes internally
  // after staging A -- confirm against magma-common-nontensor.h.

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads
    CeedScalar rA[P];
    read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);

    // per-thread column of the Q x NB chunk of C, computed in registers
    CeedScalar rC[NB];
    mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

    // write C
    if (id < nblocks) {
      write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
    }

    // advance to the next component: A matrices are stacked Q * P scalars
    // apart, C component blocks are Q * n scalars apart
    dA += Q * P;
    dC += Q * n;

    // ensure sA is fully consumed before the next iteration restages A
    __syncthreads();
  }
}

////////////////////////////////////////////////////////////////////////////////
// Non-tensor "Transpose" apply over Q_COMP components: accumulates each
// component of B (Q x n blocks, stacked Q * n apart) against the matching
// P x Q basis matrix into a register accumulator, then writes the single
// P x n output C once after the component loop. Chunking scheme matches
// magma_basis_nontensor_device_n.
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void
magma_basis_nontensor_device_t(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                               CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global NB-column chunk index
  const int nblocks = (n + NB - 1) / NB;             // total number of chunks
  const int myn     = min(NB, n - id * NB);          // valid columns in this chunk (tail chunk may be short;
                                                     // non-positive for id >= nblocks, masked by the guards below)

  // advance to this chunk's slice of B (Q rows) and C (P rows)
  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  // NOTE: sA overlaps the ty == 0 B tile; A is consumed into registers before
  // B is staged, with the barriers below separating the two uses
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  // per-chunk accumulator, carried across the component loop
  CeedScalar rC[NB] = {0.0};

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads
    CeedScalar rA[Q];
    read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
    __syncthreads();  // A now in registers; shared buffer may be reused for B

    // read B
    if (id < nblocks) {
      read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
    }
    __syncthreads();  // make sB visible to all threads before the multiply

    // rC += (component d of A) * (component d of this B chunk)
    addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

    // advance to the next component: A matrices are P * Q scalars apart,
    // B component blocks are Q * n scalars apart
    dA += P * Q;
    dB += Q * n;

    __syncthreads();  // sB fully consumed before the next iteration restages A
  }

  // write C
  if (id < nblocks) {
    write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
  }
}

////////////////////////////////////////////////////////////////////////////////
// Single-component (Q_COMP == 1) specialization of the "NoTranspose" apply:
// applies the single P x Q basis matrix to each NB-column chunk of B with no
// component loop. The shared buffer is used first to stage A, then reused for
// B; threads whose chunk index is past the tail exit after the collective read
// of A.
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global NB-column chunk index
  const int nblocks = (n + NB - 1) / NB;             // total number of chunks
  const int myn     = min(NB, n - id * NB);          // valid columns in this chunk (tail chunk may be short)

  // advance to this chunk's slice of B and C
  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q
  // NOTE: sA overlaps the ty == 0 B tile; A is consumed into registers (see
  // the barrier after the read below) before B is staged over it
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * P * NB;

  // read A using all threads
  CeedScalar rA[P];
  read_A_trans_g2r_1D_nosync<CeedScalar, Q, P,
MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();  // A now in registers; shared buffer may be reused for B

  // terminate threads with no work
  // NOTE(review): threads that return here do not reach the barrier below;
  // exited threads are assumed not to be counted by __syncthreads() -- the
  // usual assumption in MAGMA-style kernels, but worth confirming.
  if (id >= nblocks) return;

  // read B
  read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  __syncthreads();  // make sB visible before the multiply

  // per-thread column of the Q x NB chunk of C
  CeedScalar rC[NB];
  mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

  // write C
  write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
}

////////////////////////////////////////////////////////////////////////////////
// Single-component (Q_COMP == 1) specialization of the "Transpose" apply:
// mirrors magma_basis_nontensor_device_t without the component loop or the
// carried register accumulator. B chunks are Q x NB, C chunks are P x NB.
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_t1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global NB-column chunk index
  const int nblocks = (n + NB - 1) / NB;             // total number of chunks
  const int myn     = min(NB, n - id * NB);          // valid columns in this chunk (tail chunk may be short)

  // advance to this chunk's slice of B (Q rows) and C (P rows)
  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  // NOTE: sA overlaps the ty == 0 B tile; A is consumed into registers before
  // B is staged over it
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  // read A using all threads
  CeedScalar rA[Q];
  read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();  // A now in registers; shared buffer may be reused for B

  // terminate threads with no work (exited threads skip the barrier below)
  if (id >= nblocks) return;

  // read B
  read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
  __syncthreads();  // make sB visible before the multiply

  // per-thread column of the P x NB chunk of C
  CeedScalar rC[NB];
  mul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

  // write C
  write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
}

////////////////////////////////////////////////////////////////////////////////
// Kernel: non-tensor interpolation, "NoTranspose" direction.
// dA = interp matrices, dB = input (P x n), dC = output (Q x n per component).
// Dynamic shared memory is provided through MAGMA_DEVICE_SHARED; thread-block
// shape is chosen by the host launcher (presumably blockDim.x == BASIS_Q,
// matching the __launch_bounds__ -- confirm against the launch code).
// Dispatches at compile time to the single-component variant when
// BASIS_Q_COMP_INTERP == 1.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
    void magma_interp_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

  // compile-time dispatch: the dedicated single-component path avoids the
  // component loop entirely
#if BASIS_Q_COMP_INTERP == 1
  magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}

////////////////////////////////////////////////////////////////////////////////
// Kernel: non-tensor interpolation, "Transpose" direction. Mirrors
// magma_interp_nontensor_n with the P/Q roles in the launch bound swapped and
// the transpose blocking factor BASIS_NB_INTERP_T.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_interp_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

  // compile-time dispatch: single-component path when BASIS_Q_COMP_INTERP == 1
#if BASIS_Q_COMP_INTERP == 1
  magma_basis_nontensor_device_t1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}

////////////////////////////////////////////////////////////////////////////////
// Kernel: non-tensor derivative apply, "NoTranspose" direction. Same
// structure as magma_interp_nontensor_n but with BASIS_Q_COMP_DERIV
// components and the derivative blocking factor BASIS_NB_DERIV_N.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
    void magma_deriv_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

  // compile-time dispatch: single-component path when BASIS_Q_COMP_DERIV == 1
#if BASIS_Q_COMP_DERIV == 1
  magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}

////////////////////////////////////////////////////////////////////////////////
// Kernel: non-tensor derivative apply, "Transpose" direction. Mirrors
// magma_deriv_nontensor_n with the P/Q roles in the launch bound swapped and
// the transpose blocking factor BASIS_NB_DERIV_T.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D))
__global__
    void magma_deriv_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

  // compile-time dispatch: single-component path when BASIS_Q_COMP_DERIV == 1
#if BASIS_Q_COMP_DERIV == 1
  magma_basis_nontensor_device_t1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}