// xref: /libCEED/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h (revision 07d5b98a8feba68a643190b8ea9bcdac5c3e6570)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for MAGMA non-tensor basis interpolation
10 
11 #include "magma-common-nontensor.h"
12 
13 ////////////////////////////////////////////////////////////////////////////////
// "Notrans" apply with Q_COMP components: for each component d, computes the
// Q x n panel C_d from the single P x n input B, using the d-th P x Q matrix
// of dA. B is staged in shared memory once and reused for every component.
// Thread layout: ty selects one NB-column slab of B/C per (blockIdx.x, ty)
// pair; tx is the thread's lane within that slab (row assignment is defined by
// the helpers in magma-common-nontensor.h). shared_data holds
// blockDim.y * P * NB scalars of B followed by the A staging area.
// NOTE(review): template parameter T is unused -- callers instantiate with
// T = CeedScalar and the body hard-codes CeedScalar.
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global slab index: which NB-column block this ty row handles
  const int nblocks = (n + NB - 1) / NB;             // ceil(n / NB): number of slabs with real work
  const int myn     = min(NB, n - id * NB);          // valid columns in this slab (tail slab may have fewer than NB)

  // advance to this slab's column panels of B and (first component of) C
  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q
  CeedScalar *sB = shared_data + ty * P * NB;        // this ty row's private P x NB slice of B
  CeedScalar *sA = shared_data + blockDim.y * P * NB;  // A staging, shared by the whole thread block

  // read B once for all C's
  // (slabs past the end skip the load but still join the collective A reads
  // and the __syncthreads below)
  if (id < nblocks) {
    read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  }

  // NOTE(review): there is no barrier between the B load and the first
  // multiply -- each ty row loads and consumes only its own sB slice, so the
  // helpers' access pattern is presumably per-thread consistent; confirm
  // against magma-common-nontensor.h.
  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads
    CeedScalar rA[P];
    read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);

    CeedScalar rC[NB];
    mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

    // write C
    if (id < nblocks) {
      write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
    }

    dA += Q * P;  // next component's P x Q matrix
    dC += Q * n;  // components of C are stored as consecutive Q x n panels

    // ensure the shared A staging is fully consumed before the next
    // iteration's read_A overwrites it
    __syncthreads();
  }
}
55 
56 ////////////////////////////////////////////////////////////////////////////////
// "Trans" apply with Q_COMP components: accumulates the P x n output
// C = sum over d of (d-th P x Q matrix of dA applied to the d-th Q x n panel
// of dB). rC accumulates across the component loop in registers and is
// written exactly once at the end.
// Thread layout mirrors the notrans variant: ty selects one NB-column slab,
// tx is the lane within it (helpers define the row assignment).
// NOTE(review): template parameter T is unused -- callers instantiate with
// T = CeedScalar and the body hard-codes CeedScalar.
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_t(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global slab index
  const int nblocks = (n + NB - 1) / NB;             // ceil(n / NB)
  const int myn     = min(NB, n - id * NB);          // valid columns in this slab (tail slab may be short)

  // advance to this slab's column panels
  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  // sA and the ty == 0 slice of sB alias the same shared memory; this is safe
  // because every iteration consumes A into registers and passes a barrier
  // before B is staged over it.
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  // running accumulator for this thread's slice of C, summed over components
  CeedScalar rC[NB] = {0.0};

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads
    CeedScalar rA[Q];
    read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
    __syncthreads();

    // read B
    if (id < nblocks) {
      read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
    }
    __syncthreads();

    // rC += rA * sB for this component
    addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

    dA += P * Q;  // next component's P x Q matrix
    dB += Q * n;  // components of B are stored as consecutive Q x n panels

    // protect sB (and the aliased sA region) before the next iteration
    __syncthreads();
  }

  // write C
  if (id < nblocks) {
    write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
  }
}
101 
102 ////////////////////////////////////////////////////////////////////////////////
// Single-component "notrans" apply: C (Q x n) from B (P x n) with A of size
// P x Q, A read transposed into registers. Each (blockIdx.x, threadIdx.y)
// pair owns one NB-wide column slab of B and C.
// NOTE(review): template parameter T is unused; the body uses CeedScalar.
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int ix        = threadIdx.x;
  const int iy        = threadIdx.y;
  const int slab      = blockIdx.x * blockDim.y + iy;  // global NB-column slab index
  const int num_slabs = (n + NB - 1) / NB;             // slabs with real work
  const int cols      = min(NB, n - slab * NB);        // tail slab may hold fewer than NB columns

  // A staging sits at the base of shared memory; each ty row also gets its
  // own P x NB panel for B (the ty == 0 panel aliases the A staging, which is
  // safe: A is in registers and a barrier has passed before B lands there)
  CeedScalar *shA = shared_data;
  CeedScalar *shB = shared_data + iy * P * NB;

  // advance to this slab's panels
  dB += slab * P * NB;
  dC += slab * Q * NB;

  // stage A through shared memory into registers; every thread participates,
  // including those whose slab is past the end of the matrix
  CeedScalar regA[P];
  read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(ix, iy, dA, shA, regA);
  __syncthreads();

  // slabs with no columns are done once the collective A read is over
  if (slab >= num_slabs) return;

  // load this slab's B panel, then multiply and write the result
  read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(ix, cols, dB, shB);
  __syncthreads();

  CeedScalar regC[NB];
  mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(regA, shB, regC);
  write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(ix, cols, regC, dC);
}
137 
138 ////////////////////////////////////////////////////////////////////////////////
// Single-component "trans" apply: C (P x n) from B (Q x n) with A of size
// P x Q, A read non-transposed into registers. Each (blockIdx.x, threadIdx.y)
// pair owns one NB-wide column slab of B and C.
// NOTE(review): template parameter T is unused; the body uses CeedScalar.
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_t1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int ix        = threadIdx.x;
  const int iy        = threadIdx.y;
  const int slab      = blockIdx.x * blockDim.y + iy;  // global NB-column slab index
  const int num_slabs = (n + NB - 1) / NB;             // slabs with real work
  const int cols      = min(NB, n - slab * NB);        // tail slab may hold fewer than NB columns

  // A staging at the base of shared memory; one Q x NB B panel per ty row
  // (the ty == 0 panel aliases the A staging -- safe, since A is consumed
  // into registers and a barrier passed before B is loaded)
  CeedScalar *shA = shared_data;
  CeedScalar *shB = shared_data + iy * Q * NB;

  // advance to this slab's panels
  dB += slab * Q * NB;
  dC += slab * P * NB;

  // stage A through shared memory into registers; every thread participates
  CeedScalar regA[Q];
  read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(ix, iy, dA, shA, regA);
  __syncthreads();

  // slabs with no columns are done once the collective A read is over
  if (slab >= num_slabs) return;

  // load this slab's B panel, then multiply and write the result
  read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(ix, cols, dB, shB);
  __syncthreads();

  CeedScalar regC[NB];
  mul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(regA, shB, regC);
  write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(ix, cols, regC, dC);
}
173 
174 ////////////////////////////////////////////////////////////////////////////////
// Kernel: non-tensor interpolation, "notrans" direction.
// dA is the BASIS_P x BASIS_Q interp matrix (one per component), dB the input
// panels, dC the output; n is the number of columns processed.
// BASIS_* macros and MAGMA_DEVICE_SHARED are supplied by the JIT compilation,
// not defined in this file.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
    void magma_interp_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

  // single-component fast path skips the component loop and its extra syncs
#if BASIS_Q_COMP_INTERP == 1
  magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
185 
186 ////////////////////////////////////////////////////////////////////////////////
// Kernel: non-tensor interpolation, "trans" direction (launch bounds keyed on
// BASIS_P, since the transpose variant produces P rows).
// BASIS_* macros and MAGMA_DEVICE_SHARED are supplied by the JIT compilation.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_interp_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

  // single-component fast path skips the component loop and its extra syncs
#if BASIS_Q_COMP_INTERP == 1
  magma_basis_nontensor_device_t1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
197 
198 ////////////////////////////////////////////////////////////////////////////////
// Kernel: non-tensor derivative (gradient), "notrans" direction. Identical
// dispatch to the interp kernels but keyed on the BASIS_*_DERIV macros, which
// typically select multiple Q components (one per derivative direction).
// BASIS_* macros and MAGMA_DEVICE_SHARED are supplied by the JIT compilation.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
    void magma_deriv_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

  // single-component fast path skips the component loop and its extra syncs
#if BASIS_Q_COMP_DERIV == 1
  magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
209 
210 ////////////////////////////////////////////////////////////////////////////////
// Kernel: non-tensor derivative (gradient), "trans" direction (launch bounds
// keyed on BASIS_P; the transpose variant accumulates over components).
// BASIS_* macros and MAGMA_DEVICE_SHARED are supplied by the JIT compilation.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_deriv_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

  // single-component fast path skips the component loop and its extra syncs
#if BASIS_Q_COMP_DERIV == 1
  magma_basis_nontensor_device_t1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
221