xref: /libCEED/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h (revision 4f69910b6e3819988a1446e35e0e85e74672bc23)
1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for MAGMA non-tensor basis interpolation
10 #include "magma-common-nontensor.h"
11 
12 ////////////////////////////////////////////////////////////////////////////////
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  // Multi-component "notrans" apply: for each component d in [0, Q_COMP),
  //   C_d (Q x n) = A_d^T * B, with A_d (P x Q) read transposed and B (P x n).
  // dA packs the component matrices back to back (Q*P scalars apart) and the
  // component slabs of dC are contiguous (Q*n scalars apart); B is shared by
  // all components, so it is staged into shared memory exactly once.
  // Each ty-column of the thread block owns one NB-column slab of B/C.
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global index of this thread column's slab
  const int nblocks = (n + NB - 1) / NB;             // total number of NB-column slabs
  const int myn     = min(NB, n - id * NB);          // columns in this slab (tail slab may be short)

  // advance to this slab's columns of B and C
  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q
  // shared layout: [ sB(ty=0) | sB(ty=1) | ... | sB(ty=blockDim.y-1) | sA ] -- disjoint regions
  CeedScalar *sB = shared_data + ty * P * NB;
  CeedScalar *sA = shared_data + blockDim.y * P * NB;

  // read B once for all C's
  if (id < nblocks) {
    read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  }

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads -- intentionally NOT guarded by id < nblocks:
    // NOTE(review): presumably the helper stages A through sA cooperatively
    // across the whole thread block, so every thread must participate; confirm
    // against read_A_trans_g2r_1D_nosync in magma-common-nontensor.h
    CeedScalar rA[P];
    read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);

    // per-thread results for this slab: rA (registers) times sB (shared)
    CeedScalar rC[NB];
    mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

    // write C (only thread columns that own a real slab)
    if (id < nblocks) {
      write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
    }

    // step to the next component: A matrices are Q*P apart, C slabs are Q*n apart
    dA += Q * P;
    dC += Q * n;

    // barrier before the next iteration re-stages A into sA
    __syncthreads();
  }
}
54 
55 ////////////////////////////////////////////////////////////////////////////////
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_t(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  // Multi-component "trans" apply: C (P x n) = sum over d of A_d (P x Q) * B_d (Q x n).
  // dA packs the component matrices back to back (P*Q scalars apart) and the
  // component slabs of dB are Q*n scalars apart. The sum over components is
  // accumulated in registers (rC) and written to global C exactly once.
  // Each ty-column of the thread block owns one NB-column slab of B/C.
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global index of this thread column's slab
  const int nblocks = (n + NB - 1) / NB;             // total number of NB-column slabs
  const int myn     = min(NB, n - id * NB);          // columns in this slab (tail slab may be short)

  // advance to this slab's columns of B and C
  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  // sA (staging area for A) and the per-ty sB slabs alias the same shared
  // memory; the reuse is safe because an explicit __syncthreads() separates
  // the A staging (consumed into registers rA) from the B staging below
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  // per-thread accumulator, carried across all Q_COMP components
  CeedScalar rC[NB] = {0.0};

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads (no id guard: the whole block cooperates in staging)
    CeedScalar rA[Q];
    read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
    __syncthreads();

    // read B
    if (id < nblocks) {
      read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
    }
    __syncthreads();

    // rC += rA * sB for this component
    addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

    // step to the next component: A matrices are P*Q apart, B slabs are Q*n apart
    dA += P * Q;
    dB += Q * n;

    // barrier before the next iteration overwrites sA/sB (they alias)
    __syncthreads();
  }

  // write C
  if (id < nblocks) {
    write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
  }
}
100 
101 ////////////////////////////////////////////////////////////////////////////////
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_ta(const int n, const CeedScalar *dA, const CeedScalar *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  // Multi-component "trans + add" apply: like magma_basis_nontensor_device_t,
  // i.e. rC = sum over d of A_d (P x Q) * B_d (Q x n), except the epilogue
  // sums the result into existing dC values instead of overwriting them
  // (via sum_C_r2g_1D_nosync rather than write_C_r2g_1D_nosync).
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global index of this thread column's slab
  const int nblocks = (n + NB - 1) / NB;             // total number of NB-column slabs
  const int myn     = min(NB, n - id * NB);          // columns in this slab (tail slab may be short)

  // advance to this slab's columns of B and C
  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  // sA and the per-ty sB slabs alias the same shared memory; the explicit
  // __syncthreads() after the A read makes the reuse safe (A is in registers)
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  // per-thread accumulator, carried across all Q_COMP components
  CeedScalar rC[NB] = {0.0};

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads (no id guard: the whole block cooperates in staging)
    CeedScalar rA[Q];
    read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
    __syncthreads();

    // read B
    if (id < nblocks) {
      read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
    }
    __syncthreads();

    // rC += rA * sB for this component
    addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

    // step to the next component: A matrices are P*Q apart, B slabs are Q*n apart
    dA += P * Q;
    dB += Q * n;

    // barrier before the next iteration overwrites sA/sB (they alias)
    __syncthreads();
  }

  // sum into C
  if (id < nblocks) {
    sum_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
  }
}
146 
147 ////////////////////////////////////////////////////////////////////////////////
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  // Single-component specialization of the "notrans" apply:
  //   C (Q x n) = A^T * B, with A (P x Q) read transposed and B (P x n).
  // With only one component there is no loop, so A is staged once and idle
  // thread columns can simply exit after that staging.
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global index of this thread column's slab
  const int nblocks = (n + NB - 1) / NB;             // total number of NB-column slabs
  const int myn     = min(NB, n - id * NB);          // columns in this slab (tail slab may be short)

  // advance to this slab's columns of B and C
  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q
  // sA and the per-ty sB slabs alias the same shared memory; the explicit
  // __syncthreads() after the A read makes the reuse safe (A is in registers)
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * P * NB;

  // read A using all threads (the whole block cooperates in staging through sA)
  CeedScalar rA[P];
  read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();

  // terminate threads with no work
  // NOTE(review): the __syncthreads() below is then executed only by the
  // surviving threads -- this relies on exited threads not participating in
  // block barriers, and on whole ty-columns (which share an id) exiting together
  if (id >= nblocks) return;

  // read B
  read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  __syncthreads();

  // per-thread results for this slab: rA (registers) times sB (shared)
  CeedScalar rC[NB];
  mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

  // write C
  write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
}
182 
183 ////////////////////////////////////////////////////////////////////////////////
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_t1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  // Single-component specialization of the "trans" apply:
  //   C (P x n) = A * B, with A (P x Q) and B (Q x n).
  // With only one component there is no accumulation loop, so A is staged once
  // and idle thread columns can simply exit after that staging.
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global index of this thread column's slab
  const int nblocks = (n + NB - 1) / NB;             // total number of NB-column slabs
  const int myn     = min(NB, n - id * NB);          // columns in this slab (tail slab may be short)

  // advance to this slab's columns of B and C
  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  // sA and the per-ty sB slabs alias the same shared memory; the explicit
  // __syncthreads() after the A read makes the reuse safe (A is in registers)
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  // read A using all threads (the whole block cooperates in staging through sA)
  CeedScalar rA[Q];
  read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();

  // terminate threads with no work
  // NOTE(review): the __syncthreads() below is then executed only by the
  // surviving threads -- relies on exited threads not participating in block
  // barriers, and on whole ty-columns (which share an id) exiting together
  if (id >= nblocks) return;

  // read B
  read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
  __syncthreads();

  // per-thread results for this slab: rA (registers) times sB (shared)
  CeedScalar rC[NB];
  mul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

  // write C
  write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
}
218 
219 ////////////////////////////////////////////////////////////////////////////////
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_ta1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                   CeedScalar *shared_data) {
  // Single-component "trans + add" apply: like magma_basis_nontensor_device_t1,
  // i.e. rC = A (P x Q) * B (Q x n), except the epilogue sums the result into
  // existing dC values instead of overwriting them (sum_C_r2g_1D_nosync
  // rather than write_C_r2g_1D_nosync).
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global index of this thread column's slab
  const int nblocks = (n + NB - 1) / NB;             // total number of NB-column slabs
  const int myn     = min(NB, n - id * NB);          // columns in this slab (tail slab may be short)

  // advance to this slab's columns of B and C
  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  // sA and the per-ty sB slabs alias the same shared memory; the explicit
  // __syncthreads() after the A read makes the reuse safe (A is in registers)
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  // read A using all threads (the whole block cooperates in staging through sA)
  CeedScalar rA[Q];
  read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();

  // terminate threads with no work
  // NOTE(review): the __syncthreads() below is then executed only by the
  // surviving threads -- relies on exited threads not participating in block
  // barriers, and on whole ty-columns (which share an id) exiting together
  if (id >= nblocks) return;

  // read B
  read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
  __syncthreads();

  // per-thread results for this slab: rA (registers) times sB (shared)
  CeedScalar rC[NB];
  mul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

  // sum into C
  sum_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
}
254 
255 ////////////////////////////////////////////////////////////////////////////////
// Interp "notrans" entry point. The number of interp output components is a
// JIT-time constant, so the single-component fast path is selected by the
// preprocessor rather than at runtime.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
    void magma_interp_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_INTERP != 1
  magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
266 
267 ////////////////////////////////////////////////////////////////////////////////
// Interp "trans" entry point. The number of interp components is a JIT-time
// constant, so the single-component fast path is selected by the preprocessor
// rather than at runtime.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_interp_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_INTERP != 1
  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_t1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
278 
279 ////////////////////////////////////////////////////////////////////////////////
// Interp "trans + add" entry point (sums into dC). Shares the trans blocking
// factor BASIS_NB_INTERP_T; the single-component fast path is selected by the
// preprocessor since the component count is a JIT-time constant.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_interp_nontensor_ta(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_INTERP != 1
  magma_basis_nontensor_device_ta<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_ta1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
290 
291 ////////////////////////////////////////////////////////////////////////////////
// Deriv "notrans" entry point. The number of derivative components is a
// JIT-time constant, so the single-component fast path is selected by the
// preprocessor rather than at runtime.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
    void magma_deriv_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_DERIV != 1
  magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
302 
303 ////////////////////////////////////////////////////////////////////////////////
// Deriv "trans" entry point. The number of derivative components is a JIT-time
// constant, so the single-component fast path is selected by the preprocessor
// rather than at runtime.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_deriv_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_DERIV != 1
  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_t1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
314 
315 ////////////////////////////////////////////////////////////////////////////////
// Deriv "trans + add" entry point (sums into dC). Shares the trans blocking
// factor BASIS_NB_DERIV_T; the single-component fast path is selected by the
// preprocessor since the component count is a JIT-time constant.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_deriv_nontensor_ta(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_DERIV != 1
  magma_basis_nontensor_device_ta<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_ta1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
326