// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for MAGMA non-tensor basis interpolation and differentiation
#ifndef CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H
#define CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H

#include "magma-common-nontensor.h"

////////////////////////////////////////////////////////////////////////////////
// Apply a non-tensor basis in "notrans" mode for Q_COMP components:
// for each component d, a Q x n block of C is produced from A_d (P x Q) and
// B (P x n), with A read through read_A_trans_g2r_1D_nosync (i.e. C_d = A_d^T * B).
//
// Thread layout: each ty "column" of the thread block owns one NB-wide panel
// of columns of B/C; id is the global panel index and myn is the number of
// valid columns in this panel (the tail panel may be partial, and ty groups
// with id >= nblocks have no columns and skip the guarded reads/writes).
//
// Shared-memory layout: blockDim.y per-ty panels of B (P*NB scalars each)
// first, then a single copy of A (sA) shared by all ty groups. B is staged
// once and reused for every component; only dA/dC advance per component.
//
// NOTE(review): the *_nosync helpers perform no block synchronization
// themselves; the only block-wide barrier here is the __syncthreads() at the
// end of each component iteration, which guards the reuse of sA across
// iterations. Confirm against the helper implementations in
// magma-common-nontensor.h before reordering anything in this function.
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global NB-column panel index
  const int nblocks = (n + NB - 1) / NB;             // total number of column panels
  const int myn     = min(NB, n - id * NB);          // valid columns in this panel (tail may be < NB)

  // advance to this panel's columns of B and C
  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q
  CeedScalar *sB = shared_data + ty * P * NB;          // this ty group's staged panel of B
  CeedScalar *sA = shared_data + blockDim.y * P * NB;  // shared copy of A, placed after all B panels

  // read B once for all C's
  if (id < nblocks) {
    read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  }

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // init rA, rC
    CeedScalar rA[P], rC[NB];

    // read A using all threads
    read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);

    mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

    // write C
    if (id < nblocks) {
      write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
    }

    // next component: A matrices are consecutive (Q*P each); C components are n columns apart
    dA += Q * P;
    dC += Q * n;

    __syncthreads();
  }
}

////////////////////////////////////////////////////////////////////////////////
// Single-component (Q_COMP == 1) specialization of the "notrans" apply:
// C (Q x n) is produced from A (P x Q) and B (P x n), with A read through
// read_A_trans_g2r_1D_nosync. Same panel decomposition as
// magma_basis_nontensor_device_n: each ty group owns one NB-column panel.
//
// Shared memory is reused here: A is staged through sA (== shared_data) into
// registers first, then after a barrier B is staged into sB, which overlaps
// the sA region for ty == 0 — safe because A is no longer needed in shared
// memory once it sits in rA.
//
// NOTE(review): ty groups with no work return before the later
// __syncthreads(); this relies on exited threads not being required at
// subsequent barriers — confirm this matches the synchronization assumptions
// used elsewhere in the MAGMA backend.
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global NB-column panel index
  const int nblocks = (n + NB - 1) / NB;             // total number of column panels
  const int myn     = min(NB, n - id * NB);          // valid columns in this panel (tail may be < NB)

  // advance to this panel's columns of B and C
  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * P * NB;  // per-ty panel of B (overlaps sA for ty == 0; see note above)

  // init rA, rC
  CeedScalar rA[P], rC[NB];

  // read A using all threads
  read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();

  // terminate threads with no work
  if (id >= nblocks) return;

  // read B
  read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  __syncthreads();

  mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

  // write C
  write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
}

////////////////////////////////////////////////////////////////////////////////
// Apply a non-tensor basis in "trans" mode, accumulating over Q_COMP
// components: C (P x n) accumulates one product per component from A_d (P x Q)
// and the d-th Q x n block of B, with A read through
// read_A_notrans_g2r_1D_nosync and the products summed into rC via addmul
// (i.e. C = sum_d A_d * B_d).
//
// Same panel decomposition as the "notrans" kernels (note that the roles of
// P and Q swap in the panel offsets: B panels are Q x NB, C panels P x NB).
// A is re-staged through sA (== shared_data) every iteration, and sB — which
// may overlap sA for ty == 0 — is refilled with the component's panel of B
// only after the barrier that follows the A read.
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_t(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global NB-column panel index
  const int nblocks = (n + NB - 1) / NB;             // total number of column panels
  const int myn     = min(NB, n - id * NB);          // valid columns in this panel (tail may be < NB)

  // advance to this panel's columns of B and C
  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;  // per-ty panel of B (overlaps sA for ty == 0; see note above)

  // init rC (accumulator across all Q_COMP components)
  CeedScalar rC[NB] = {0.0};

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // init rA
    CeedScalar rA[Q];

    // read A using all threads
    read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
    __syncthreads();

    // read B
    if (id < nblocks) {
      read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
    }
    __syncthreads();

    addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

    // next component: A matrices are consecutive (P*Q each); B components are n columns apart
    dA += P * Q;
    dB += Q * n;

    __syncthreads();
  }

  // write C
  if (id < nblocks) {
    write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
  }
}

////////////////////////////////////////////////////////////////////////////////
// Interpolation, notrans: dispatches to the single-component fast path when
// BASIS_Q_COMP_INTERP == 1. The BASIS_* constants are macros supplied at
// JIT/compile time.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
    void magma_interp_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

  if (BASIS_Q_COMP_INTERP == 1) {
    magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
  } else {
    magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
  }
}

////////////////////////////////////////////////////////////////////////////////
// Interpolation, trans (accumulates over BASIS_Q_COMP_INTERP components).
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_interp_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
}

////////////////////////////////////////////////////////////////////////////////
// Differentiation, notrans: same dispatch shape as interpolation, using the
// derivative component count and blocking factor.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
    void magma_deriv_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

  if (BASIS_Q_COMP_DERIV == 1) {
    magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
  } else {
    magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
  }
}

////////////////////////////////////////////////////////////////////////////////
// Differentiation, trans (accumulates over BASIS_Q_COMP_DERIV components).
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_deriv_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
}

#endif  // CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H