xref: /libCEED/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h (revision db2becc9f302fe8eb3a32ace50ce3f3a5d42e6c4)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for MAGMA non-tensor basis interpolation
10 
11 #include "magma-common-nontensor.h"
12 
13 ////////////////////////////////////////////////////////////////////////////////
// Non-tensor basis action, "notrans" direction, for Q_COMP components:
//   C_d = A_d^T * B, d = 0, ..., Q_COMP-1
// where each A_d is P x Q (stored back-to-back in dA), B is P x n, and the
// C_d are Q x n (stored back-to-back in dC).  Each ty-slice of the thread
// block processes one NB-column chunk of B/C; `id` is the global chunk index.
// Helpers come from magma-common-nontensor.h; their _nosync suffix means the
// caller owns barrier placement.
// Shared memory layout: blockDim.y tiles of P*NB for B, followed by a staging
// area for A.
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global NB-column chunk index
  const int nblocks = (n + NB - 1) / NB;             // total number of chunks over n
  const int myn     = min(NB, n - id * NB);          // columns in this chunk (tail chunk may be short)

  // advance to this chunk's slice of B and C
  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q
  CeedScalar *sB = shared_data + ty * P * NB;          // per-ty B tile
  CeedScalar *sA = shared_data + blockDim.y * P * NB;  // A staging area shared by all ty-slices

  // read B once for all C's
  if (id < nblocks) {
    read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  }

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads
    CeedScalar rA[P];
    read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);

    // NOTE(review): no explicit barrier between the B staging above and this
    // multiply -- correctness appears to rely on a barrier inside
    // read_A_trans_g2r_1D_nosync; confirm in magma-common-nontensor.h.
    CeedScalar rC[NB];
    mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

    // write C
    if (id < nblocks) {
      write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
    }

    // advance to the next component's A matrix and C block
    dA += Q * P;
    dC += Q * n;

    // all threads must be done reading sA before the next iteration overwrites it
    __syncthreads();
  }
}
55 
56 ////////////////////////////////////////////////////////////////////////////////
// Non-tensor basis action, "trans" direction, for Q_COMP components:
//   C = sum_d A_d * B_d, d = 0, ..., Q_COMP-1
// where each A_d is P x Q (stored back-to-back in dA), each B_d is Q x n
// (stored back-to-back in dB), and C is P x n.  Partial products are
// accumulated in registers (rC) and written to dC once after the loop.
// Each ty-slice of the thread block processes one NB-column chunk; `id` is
// the global chunk index.  Helpers come from magma-common-nontensor.h; their
// _nosync suffix means the caller owns barrier placement.
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_t(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global NB-column chunk index
  const int nblocks = (n + NB - 1) / NB;             // total number of chunks over n
  const int myn     = min(NB, n - id * NB);          // columns in this chunk (tail chunk may be short)

  // advance to this chunk's slice of B and C
  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  CeedScalar *sA = shared_data;                // staging area handed to the A reader
  CeedScalar *sB = shared_data + ty * Q * NB;  // per-ty B tile (overlaps sA for ty == 0; barriers below order the reuse)

  CeedScalar rC[NB] = {0.0};  // running sum over components

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads
    CeedScalar rA[Q];
    read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
    __syncthreads();

    // read B
    if (id < nblocks) {
      read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
    }
    __syncthreads();

    // accumulate this component's contribution: rC += rA * sB
    addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

    // advance to the next component's A and B
    dA += P * Q;
    dB += Q * n;

    // all threads must finish consuming sB before the next iteration overwrites it
    __syncthreads();
  }

  // write C
  if (id < nblocks) {
    write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
  }
}
101 
102 ////////////////////////////////////////////////////////////////////////////////
// Non-tensor basis action, "trans" direction with accumulation, for Q_COMP
// components:
//   C += sum_d A_d * B_d, d = 0, ..., Q_COMP-1
// Identical to magma_basis_nontensor_device_t except the register result is
// added into dC (sum_C_r2g_1D_nosync) instead of overwriting it.  Each A_d is
// P x Q (stored back-to-back in dA), each B_d is Q x n (back-to-back in dB),
// and C is P x n.  Each ty-slice of the thread block processes one NB-column
// chunk; `id` is the global chunk index.  Helpers come from
// magma-common-nontensor.h; their _nosync suffix means the caller owns
// barrier placement.
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_ta(const int n, const CeedScalar *dA, const CeedScalar *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global NB-column chunk index
  const int nblocks = (n + NB - 1) / NB;             // total number of chunks over n
  const int myn     = min(NB, n - id * NB);          // columns in this chunk (tail chunk may be short)

  // advance to this chunk's slice of B and C
  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  CeedScalar *sA = shared_data;                // staging area handed to the A reader
  CeedScalar *sB = shared_data + ty * Q * NB;  // per-ty B tile (overlaps sA for ty == 0; barriers below order the reuse)

  CeedScalar rC[NB] = {0.0};  // running sum over components

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads
    CeedScalar rA[Q];
    read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
    __syncthreads();

    // read B
    if (id < nblocks) {
      read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
    }
    __syncthreads();

    // accumulate this component's contribution: rC += rA * sB
    addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

    // advance to the next component's A and B
    dA += P * Q;
    dB += Q * n;

    // all threads must finish consuming sB before the next iteration overwrites it
    __syncthreads();
  }

  // sum into C
  if (id < nblocks) {
    sum_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
  }
}
147 
148 ////////////////////////////////////////////////////////////////////////////////
// Single-component "notrans" apply: C = A^T * B, with A (P x Q), B (P x n),
// and C (Q x n).  Each ty-slice of the thread block owns one NB-wide column
// chunk of B/C.  Helpers come from magma-common-nontensor.h; their _nosync
// suffix means the caller owns barrier placement.
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int t_x        = threadIdx.x;
  const int t_y        = threadIdx.y;
  const int chunk      = blockIdx.x * blockDim.y + t_y;  // global NB-column chunk index
  const int num_chunks = (n + NB - 1) / NB;
  const int cols       = min(NB, n - chunk * NB);        // active columns in this chunk (tail may be short)

  // A (P x Q) is staged at the base of shared memory; the per-ty B tiles reuse
  // the same space (t_y == 0 aliases sA) -- the barrier after the A read makes
  // that reuse safe
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + t_y * P * NB;

  // stage A into registers through shared memory, using every thread
  CeedScalar rA[P];
  read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(t_x, t_y, dA, sA, rA);
  __syncthreads();

  // chunks past the tail of the matrix have nothing to do
  if (chunk >= num_chunks) return;

  // stage this chunk of B into shared memory
  read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(t_x, cols, dB + chunk * P * NB, sB);
  __syncthreads();

  // multiply and write the matching chunk of C
  CeedScalar rC[NB];
  mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

  write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(t_x, cols, rC, dC + chunk * Q * NB);
}
183 
184 ////////////////////////////////////////////////////////////////////////////////
// Single-component "trans" apply: C = A * B, with A (P x Q), B (Q x n), and
// C (P x n).  Each ty-slice of the thread block owns one NB-wide column chunk
// of B/C.  Helpers come from magma-common-nontensor.h; their _nosync suffix
// means the caller owns barrier placement.
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_t1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int t_x        = threadIdx.x;
  const int t_y        = threadIdx.y;
  const int chunk      = blockIdx.x * blockDim.y + t_y;  // global NB-column chunk index
  const int num_chunks = (n + NB - 1) / NB;
  const int cols       = min(NB, n - chunk * NB);        // active columns in this chunk (tail may be short)

  // A (P x Q) staging sits at the base of shared memory; per-ty B tiles follow
  // (t_y == 0 aliases sA) -- the barrier after the A read orders the reuse
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + t_y * Q * NB;

  // load A into registers using every thread in the block
  CeedScalar rA[Q];
  read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(t_x, t_y, dA, sA, rA);
  __syncthreads();

  // chunks past the tail of the matrix have nothing to do
  if (chunk >= num_chunks) return;

  // stage this chunk of B into shared memory
  read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(t_x, cols, dB + chunk * Q * NB, sB);
  __syncthreads();

  // multiply and write the matching chunk of C
  CeedScalar rC[NB];
  mul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

  write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(t_x, cols, rC, dC + chunk * P * NB);
}
219 
220 ////////////////////////////////////////////////////////////////////////////////
// Single-component "trans" apply with accumulation: C += A * B, with A
// (P x Q), B (Q x n), and C (P x n).  Identical to
// magma_basis_nontensor_device_t1 except the result is summed into dC
// (sum_C_r2g_1D_nosync) rather than overwriting it.  Each ty-slice of the
// thread block owns one NB-wide column chunk.
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_ta1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                   CeedScalar *shared_data) {
  const int t_x        = threadIdx.x;
  const int t_y        = threadIdx.y;
  const int chunk      = blockIdx.x * blockDim.y + t_y;  // global NB-column chunk index
  const int num_chunks = (n + NB - 1) / NB;
  const int cols       = min(NB, n - chunk * NB);        // active columns in this chunk (tail may be short)

  // A (P x Q) staging sits at the base of shared memory; per-ty B tiles follow
  // (t_y == 0 aliases sA) -- the barrier after the A read orders the reuse
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + t_y * Q * NB;

  // load A into registers using every thread in the block
  CeedScalar rA[Q];
  read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(t_x, t_y, dA, sA, rA);
  __syncthreads();

  // chunks past the tail of the matrix have nothing to do
  if (chunk >= num_chunks) return;

  // stage this chunk of B into shared memory
  read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(t_x, cols, dB + chunk * Q * NB, sB);
  __syncthreads();

  // multiply, then add the result into the matching chunk of C
  CeedScalar rC[NB];
  mul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

  sum_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(t_x, cols, rC, dC + chunk * P * NB);
}
255 
256 ////////////////////////////////////////////////////////////////////////////////
// Interpolation kernel, notrans direction.  Dispatches at compile time on the
// number of interp components: a specialized single-component path is used
// when BASIS_Q_COMP_INTERP == 1.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
    void magma_interp_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_INTERP != 1
  magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
267 
268 ////////////////////////////////////////////////////////////////////////////////
// Interpolation kernel, trans direction.  Dispatches at compile time on the
// number of interp components: a specialized single-component path is used
// when BASIS_Q_COMP_INTERP == 1.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_interp_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_INTERP != 1
  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_t1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
279 
280 ////////////////////////////////////////////////////////////////////////////////
// Interpolation kernel, trans direction with accumulation into dC.  Reuses the
// transpose blocking size (BASIS_NB_INTERP_T).  Dispatches at compile time on
// the number of interp components.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_interp_nontensor_ta(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_INTERP != 1
  magma_basis_nontensor_device_ta<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_ta1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
291 
292 ////////////////////////////////////////////////////////////////////////////////
// Derivative kernel, notrans direction.  Dispatches at compile time on the
// number of derivative components: a specialized single-component path is
// used when BASIS_Q_COMP_DERIV == 1.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
    void magma_deriv_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_DERIV != 1
  magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
303 
304 ////////////////////////////////////////////////////////////////////////////////
// Derivative kernel, trans direction.  Dispatches at compile time on the
// number of derivative components: a specialized single-component path is
// used when BASIS_Q_COMP_DERIV == 1.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_deriv_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_DERIV != 1
  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_t1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
315 
316 ////////////////////////////////////////////////////////////////////////////////
// Derivative kernel, trans direction with accumulation into dC.  Reuses the
// transpose blocking size (BASIS_NB_DERIV_T).  Dispatches at compile time on
// the number of derivative components.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_deriv_nontensor_ta(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_DERIV != 1
  magma_basis_nontensor_device_ta<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_ta1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
327