1 // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED: http://github.com/ceed
7
8 /// @file
9 /// Internal header for MAGMA non-tensor basis interpolation
10 #include "magma-common-nontensor.h"
11
12 ////////////////////////////////////////////////////////////////////////////////
// Multi-component "notrans" apply: for each of the Q_COMP components d, multiplies the
// d-th basis matrix A_d (stored P x Q, read transposed into registers) by the P x n
// input B to produce the d-th Q x n output block of C.  B is staged in shared memory
// ONCE and reused for every component; A is re-read from global memory per component.
//
// Thread layout: tx = threadIdx.x spans the rows of the output tile, ty = threadIdx.y
// selects one NB-wide column slice of B/C; id is the global slice index over the grid.
// Threads with id >= nblocks own no slice, but they still participate in staging A, so
// they must keep executing the component loop (they only skip the B read and C write).
//
// Shared-memory layout: blockDim.y per-slice B buffers (P*NB each) first, then one
// A staging area shared by the whole thread block.
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;
  const int id = blockIdx.x * blockDim.y + ty;
  const int nblocks = (n + NB - 1) / NB;  // number of NB-column slices covering n (ceil-div)
  const int myn = min(NB, n - id * NB);   // columns actually owned by this slice (grid tail may be short)

  // advance to this slice's columns of B and C
  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q
  CeedScalar *sB = shared_data + ty * P * NB;             // this ty-row's B slice
  CeedScalar *sA = shared_data + blockDim.y * P * NB;     // A staging area, after all B slices

  // read B once for all C's
  if (id < nblocks) {
    read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  }
  // NOTE(review): no explicit barrier here between staging sB and its first use below;
  // presumably read_A_trans_g2r_1D_nosync performs the block-wide sync that also
  // publishes sB -- confirm against magma-common-nontensor.h.

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads
    CeedScalar rA[P];
    read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);

    // each thread computes one row of its Q x NB output tile from rA and the shared B slice
    CeedScalar rC[NB];
    mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

    // write C
    if (id < nblocks) {
      write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
    }

    // advance to the next component's A matrix and C output block
    dA += Q * P;
    dC += Q * n;

    // make sA safe to overwrite in the next iteration (and keep sB reads coherent)
    __syncthreads();
  }
}
54
55 ////////////////////////////////////////////////////////////////////////////////
// Multi-component "trans" apply: accumulates C = sum over the Q_COMP components d of
// A_d * B_d, where each A_d is P x Q (read untransposed, one row per thread into
// registers) and each B_d is the d-th Q x n input block; the result C is P x n.
// The per-thread accumulator rC persists across the component loop; C is written once
// at the end.
//
// Thread layout: tx = threadIdx.x spans the P output rows, ty = threadIdx.y selects one
// NB-wide column slice; id is the global slice index.  Threads with id >= nblocks still
// run the loop to help stage A (they only skip the B read and the final C write).
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_t(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;
  const int id = blockIdx.x * blockDim.y + ty;
  const int nblocks = (n + NB - 1) / NB;  // number of NB-column slices covering n (ceil-div)
  const int myn = min(NB, n - id * NB);   // columns actually owned by this slice (grid tail may be short)

  // advance to this slice's columns of B and C
  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  // sA and the sB slices share storage (ty == 0's sB starts at sA); this is safe
  // because A is fully consumed into registers before B is staged, with a barrier
  // between the two phases inside the loop below.
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  // per-thread accumulator, carried across all components
  CeedScalar rC[NB] = {0.0};

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads
    CeedScalar rA[Q];
    read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
    __syncthreads();  // A is in registers; sA storage may now be reused for sB

    // read B
    if (id < nblocks) {
      read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
    }
    __syncthreads();  // publish sB before it is read below

    // rC += rA (1 x Q) * sB (Q x NB)
    addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

    // advance to the next component's A matrix and B input block
    dA += P * Q;
    dB += Q * n;

    __syncthreads();  // sB reads done; safe to overwrite shared memory next iteration
  }

  // write C
  if (id < nblocks) {
    write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
  }
}
100
101 ////////////////////////////////////////////////////////////////////////////////
// Multi-component "trans-accumulate" apply: identical to
// magma_basis_nontensor_device_t (C = sum_d A_d * B_d, A_d being P x Q and B_d the
// d-th Q x n input block), except the final result is SUMMED into the existing
// contents of C (sum_C_r2g) instead of overwriting it.  ("ta" presumably stands for
// transpose-add -- confirm against the host-side callers.)
//
// Thread layout: tx spans the P output rows, ty selects one NB-wide column slice;
// id is the global slice index.  Threads with id >= nblocks still run the loop to
// help stage A (they only skip the B read and the final C accumulation).
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_ta(const int n, const CeedScalar *dA, const CeedScalar *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;
  const int id = blockIdx.x * blockDim.y + ty;
  const int nblocks = (n + NB - 1) / NB;  // number of NB-column slices covering n (ceil-div)
  const int myn = min(NB, n - id * NB);   // columns actually owned by this slice (grid tail may be short)

  // advance to this slice's columns of B and C
  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  // sA and the sB slices share storage (ty == 0's sB starts at sA); safe because A is
  // consumed into registers before B is staged, with a barrier between the two phases.
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  // per-thread accumulator, carried across all components
  CeedScalar rC[NB] = {0.0};

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads
    CeedScalar rA[Q];
    read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
    __syncthreads();  // A is in registers; sA storage may now be reused for sB

    // read B
    if (id < nblocks) {
      read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
    }
    __syncthreads();  // publish sB before it is read below

    // rC += rA (1 x Q) * sB (Q x NB)
    addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

    // advance to the next component's A matrix and B input block
    dA += P * Q;
    dB += Q * n;

    __syncthreads();  // sB reads done; safe to overwrite shared memory next iteration
  }

  // sum into C
  if (id < nblocks) {
    sum_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
  }
}
146
147 ////////////////////////////////////////////////////////////////////////////////
// Single-component "notrans" apply (Q_COMP == 1 specialization): multiplies the basis
// matrix A (stored P x Q, read transposed into registers) by the P x n input B to
// produce the Q x n output C, with no component loop.  Threads with no column slice
// can exit early once A has been staged, since nothing after the exit depends on them.
//
// Thread layout: tx = threadIdx.x spans the Q output rows, ty = threadIdx.y selects
// one NB-wide column slice; id is the global slice index.
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;
  const int id = blockIdx.x * blockDim.y + ty;
  const int nblocks = (n + NB - 1) / NB;  // number of NB-column slices covering n (ceil-div)
  const int myn = min(NB, n - id * NB);   // columns actually owned by this slice (grid tail may be short)

  // advance to this slice's columns of B and C
  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q
  // sA and the sB slices share storage (ty == 0's sB starts at sA); safe because A is
  // consumed into registers before B is staged, with the barrier below in between.
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * P * NB;

  // read A using all threads
  CeedScalar rA[P];
  read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();

  // terminate threads with no work
  // NOTE(review): threads that return here never reach the __syncthreads() below; this
  // relies on exited threads being released from barrier participation.  Entire ty-rows
  // exit together (id depends only on ty), which is the established pattern in this
  // file -- confirm against the MAGMA/CUDA barrier semantics relied on upstream.
  if (id >= nblocks) return;

  // read B
  read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  __syncthreads();  // publish sB before it is read below

  // each thread computes one row of its Q x NB output tile
  CeedScalar rC[NB];
  mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

  // write C
  write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
}
182
183 ////////////////////////////////////////////////////////////////////////////////
// Single-component "trans" apply (Q_COMP == 1 specialization): computes the P x n
// output C = A * B, where A is P x Q (read untransposed, one row per thread into
// registers) and B is Q x n, with no component loop and no accumulation.  Threads
// with no column slice exit early once A has been staged.
//
// Thread layout: tx = threadIdx.x spans the P output rows, ty = threadIdx.y selects
// one NB-wide column slice; id is the global slice index.
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_t1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;
  const int id = blockIdx.x * blockDim.y + ty;
  const int nblocks = (n + NB - 1) / NB;  // number of NB-column slices covering n (ceil-div)
  const int myn = min(NB, n - id * NB);   // columns actually owned by this slice (grid tail may be short)

  // advance to this slice's columns of B and C
  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  // sA and the sB slices share storage (ty == 0's sB starts at sA); safe because A is
  // consumed into registers before B is staged, with the barrier below in between.
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  // read A using all threads
  CeedScalar rA[Q];
  read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();

  // terminate threads with no work
  // NOTE(review): threads that return here never reach the __syncthreads() below; this
  // relies on exited threads being released from barrier participation.  Entire ty-rows
  // exit together (id depends only on ty) -- same pattern as the other *1 routines here.
  if (id >= nblocks) return;

  // read B
  read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
  __syncthreads();  // publish sB before it is read below

  // rC = rA (1 x Q) * sB (Q x NB)
  CeedScalar rC[NB];
  mul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

  // write C
  write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
}
218
219 ////////////////////////////////////////////////////////////////////////////////
// Single-component "trans-accumulate" apply (Q_COMP == 1 specialization): identical to
// magma_basis_nontensor_device_t1 (P x n result of A * B, A being P x Q and B Q x n),
// except the result is SUMMED into the existing contents of C (sum_C_r2g) instead of
// overwriting it.  Threads with no column slice exit early once A has been staged.
//
// Thread layout: tx = threadIdx.x spans the P output rows, ty = threadIdx.y selects
// one NB-wide column slice; id is the global slice index.
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_ta1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                   CeedScalar *shared_data) {
  const int tx = threadIdx.x;
  const int ty = threadIdx.y;
  const int id = blockIdx.x * blockDim.y + ty;
  const int nblocks = (n + NB - 1) / NB;  // number of NB-column slices covering n (ceil-div)
  const int myn = min(NB, n - id * NB);   // columns actually owned by this slice (grid tail may be short)

  // advance to this slice's columns of B and C
  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  // sA and the sB slices share storage (ty == 0's sB starts at sA); safe because A is
  // consumed into registers before B is staged, with the barrier below in between.
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  // read A using all threads
  CeedScalar rA[Q];
  read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();

  // terminate threads with no work
  // NOTE(review): threads that return here never reach the __syncthreads() below; this
  // relies on exited threads being released from barrier participation.  Entire ty-rows
  // exit together (id depends only on ty) -- same pattern as the other *1 routines here.
  if (id >= nblocks) return;

  // read B
  read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
  __syncthreads();  // publish sB before it is read below

  // rC = rA (1 x Q) * sB (Q x NB)
  CeedScalar rC[NB];
  mul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

  // sum into C
  sum_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
}
254
255 ////////////////////////////////////////////////////////////////////////////////
__launch_bounds__(MAGMA_BASIS_BOUNDS (BASIS_Q,MAGMA_MAXTHREADS_1D))256 extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
257 void magma_interp_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
258 MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
259
260 #if BASIS_Q_COMP_INTERP == 1
261 magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
262 #else
263 magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
264 #endif
265 }
266
267 ////////////////////////////////////////////////////////////////////////////////
// Kernel: non-tensor basis interpolation, "trans" direction (B = Q_COMP * Q x n input,
// C = P x n output, summed over components).  Dispatches at compile time on the number
// of interpolation components: a single component uses the loop-free *_t1 routine,
// otherwise the accumulating multi-component *_t loop.  Column blocking factor comes
// from BASIS_NB_INTERP_T; dynamic shared memory is declared via MAGMA_DEVICE_SHARED.
// Launch bounds are keyed to BASIS_P, so blockDim.x is presumably BASIS_P -- set by
// the host launcher; confirm there.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_interp_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_INTERP == 1
  magma_basis_nontensor_device_t1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
278
279 ////////////////////////////////////////////////////////////////////////////////
// Kernel: non-tensor basis interpolation, "trans" direction with accumulation -- same
// computation as magma_interp_nontensor_t but the result is SUMMED into the existing
// contents of dC rather than overwriting it.  Deliberately reuses the transpose-apply
// blocking factor BASIS_NB_INTERP_T for both dispatch branches.  Dynamic shared memory
// is declared via MAGMA_DEVICE_SHARED; launch bounds are keyed to BASIS_P, matching
// the other transpose-direction kernels.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_interp_nontensor_ta(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_INTERP == 1
  magma_basis_nontensor_device_ta1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_ta<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
290
291 ////////////////////////////////////////////////////////////////////////////////
// Kernel: non-tensor basis derivative, "notrans" direction (B = P x n input,
// C = Q_COMP * Q x n output).  Same structure as magma_interp_nontensor_n but keyed
// to the derivative component count BASIS_Q_COMP_DERIV and blocking factor
// BASIS_NB_DERIV_N.  Dynamic shared memory is declared via MAGMA_DEVICE_SHARED;
// launch bounds are keyed to BASIS_Q, so blockDim.x is presumably BASIS_Q -- set by
// the host launcher; confirm there.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
    void magma_deriv_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_DERIV == 1
  magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
302
303 ////////////////////////////////////////////////////////////////////////////////
// Kernel: non-tensor basis derivative, "trans" direction (B = Q_COMP * Q x n input,
// C = P x n output, summed over components).  Same structure as
// magma_interp_nontensor_t but keyed to BASIS_Q_COMP_DERIV and blocking factor
// BASIS_NB_DERIV_T.  Dynamic shared memory is declared via MAGMA_DEVICE_SHARED;
// launch bounds are keyed to BASIS_P, so blockDim.x is presumably BASIS_P -- set by
// the host launcher; confirm there.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_deriv_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_DERIV == 1
  magma_basis_nontensor_device_t1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
314
315 ////////////////////////////////////////////////////////////////////////////////
// Kernel: non-tensor basis derivative, "trans" direction with accumulation -- same
// computation as magma_deriv_nontensor_t but the result is SUMMED into the existing
// contents of dC rather than overwriting it.  Deliberately reuses the transpose-apply
// blocking factor BASIS_NB_DERIV_T for both dispatch branches.  Dynamic shared memory
// is declared via MAGMA_DEVICE_SHARED; launch bounds are keyed to BASIS_P, matching
// the other transpose-direction kernels.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_deriv_nontensor_ta(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_DERIV == 1
  magma_basis_nontensor_device_ta1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_ta<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
326