xref: /libCEED/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h (revision 5aed82e4fa97acf4ba24a7f10a35f5303a6798e0)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for MAGMA non-tensor basis interpolation
10 #ifndef CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H
11 #define CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H
12 
13 #include "magma-common-nontensor.h"
14 
15 ////////////////////////////////////////////////////////////////////////////////
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  // Non-tensor basis apply, "notrans" direction, Q_COMP components:
  // for each component d, computes dC_d (Q x n) from the P x Q basis matrix
  // dA_d (read transposed) applied to dB (P x n).  B is read once and reused
  // for every component.  Each ty-column of the thread block owns one NB-wide
  // slab of the n columns; blockDim.x is assumed to equal Q (threads index
  // rows of C) -- TODO confirm against the host-side launch configuration.
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global slab index for this ty-column
  const int nblocks = (n + NB - 1) / NB;             // total number of NB-wide slabs
  const int myn     = min(NB, n - id * NB);          // columns in this slab (partial slab at the tail)

  // advance to this slab's columns of B and C
  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q
  // shared-memory layout: one P x NB slab of B per ty-column, followed by a
  // single staging area for A that is shared by the whole thread block
  CeedScalar *sB = shared_data + ty * P * NB;
  CeedScalar *sA = shared_data + blockDim.y * P * NB;

  // read B once for all C's
  // (threads whose slab index is out of range skip the read but still take
  // part in the cooperative A reads and barriers below)
  if (id < nblocks) {
    read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  }

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads
    // NOTE(review): the "_nosync" helpers do not end with a barrier; the
    // barrier between staging into sA/sB and consuming them is presumably
    // inside read_A_trans_g2r_1D_nosync -- confirm in magma-common-nontensor.h
    CeedScalar rA[P];
    read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);

    // rC = rA * sB for this component, accumulated in registers
    CeedScalar rC[NB];
    mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

    // write C
    if (id < nblocks) {
      write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
    }

    // next component: A matrices are stacked Q*P apart, C outputs Q*n apart
    dA += Q * P;
    dC += Q * n;

    // block-wide barrier (reached by all threads) so the next iteration may
    // safely re-stage A into sA
    __syncthreads();
  }
}
57 
58 ////////////////////////////////////////////////////////////////////////////////
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_t(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  // Non-tensor basis apply, "trans" direction, Q_COMP components:
  // dC (P x n) accumulates the sum over components d of the P x Q basis
  // matrix dA_d (read non-transposed) applied to dB_d (Q x n).  Each
  // ty-column of the thread block owns one NB-wide slab of the n columns;
  // blockDim.x is assumed to equal P (threads index rows of C) -- TODO
  // confirm against the host-side launch configuration.
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global slab index for this ty-column
  const int nblocks = (n + NB - 1) / NB;             // total number of NB-wide slabs
  const int myn     = min(NB, n - id * NB);          // columns in this slab (partial slab at the tail)

  // advance to this slab's columns of B and C
  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  // shared-memory layout: the A staging area and the per-ty B slabs overlap
  // on purpose -- A is moved into registers (with a barrier) before B is
  // staged into the same storage each iteration
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  // accumulator for C, zero-initialized, carried across all components
  CeedScalar rC[NB] = {0.0};

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // read A using all threads
    CeedScalar rA[Q];
    read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
    __syncthreads();

    // read B
    // (out-of-range slabs skip the read but still hit every barrier)
    if (id < nblocks) {
      read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
    }
    __syncthreads();

    // rC += rA * sB for this component
    addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

    // next component: A matrices are stacked P*Q apart, B inputs Q*n apart
    dA += P * Q;
    dB += Q * n;

    // barrier before the next iteration re-stages A over the sB storage
    __syncthreads();
  }

  // write C
  // single write of the accumulated result, after all components are summed
  if (id < nblocks) {
    write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
  }
}
103 
104 ////////////////////////////////////////////////////////////////////////////////
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  // Single-component fast path of magma_basis_nontensor_device_n:
  // dC (Q x n) = dA^T * dB with dA a P x Q basis matrix (read transposed) and
  // dB (P x n).  No component loop, so idle threads may retire early.  Each
  // ty-column of the thread block owns one NB-wide slab of the n columns;
  // blockDim.x is assumed to equal Q -- TODO confirm against the launch code.
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global slab index for this ty-column
  const int nblocks = (n + NB - 1) / NB;             // total number of NB-wide slabs
  const int myn     = min(NB, n - id * NB);          // columns in this slab (partial slab at the tail)

  // advance to this slab's columns of B and C
  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q
  // shared-memory layout: A staging area and the per-ty B slabs overlap on
  // purpose -- A is in registers (with a barrier) before B is staged
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * P * NB;

  // read A using all threads
  // (must happen before the early exit below: every thread, including ones
  // with no slab, participates in this cooperative read)
  CeedScalar rA[P];
  read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();

  // terminate threads with no work
  if (id >= nblocks) return;

  // read B
  // NOTE(review): the barrier below is reached only by surviving threads;
  // this relies on exited threads not participating in __syncthreads --
  // common MAGMA practice, but formally implementation-defined behavior
  read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  __syncthreads();

  // rC = rA * sB
  CeedScalar rC[NB];
  mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

  // write C
  write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
}
139 
140 ////////////////////////////////////////////////////////////////////////////////
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_t1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  // Single-component fast path of magma_basis_nontensor_device_t:
  // dC (P x n) = dA * dB with dA a P x Q basis matrix (read non-transposed)
  // and dB (Q x n).  No component loop and no accumulation, so idle threads
  // may retire early.  Each ty-column of the thread block owns one NB-wide
  // slab of the n columns; blockDim.x is assumed to equal P -- TODO confirm
  // against the launch code.
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global slab index for this ty-column
  const int nblocks = (n + NB - 1) / NB;             // total number of NB-wide slabs
  const int myn     = min(NB, n - id * NB);          // columns in this slab (partial slab at the tail)

  // advance to this slab's columns of B and C
  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  // shared-memory layout: A staging area and the per-ty B slabs overlap on
  // purpose -- A is in registers (with a barrier) before B is staged
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  // read A using all threads
  // (must happen before the early exit below: every thread, including ones
  // with no slab, participates in this cooperative read)
  CeedScalar rA[Q];
  read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();

  // terminate threads with no work
  if (id >= nblocks) return;

  // read B
  // NOTE(review): the barrier below is reached only by surviving threads;
  // this relies on exited threads not participating in __syncthreads --
  // common MAGMA practice, but formally implementation-defined behavior
  read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
  __syncthreads();

  // rC = rA * sB
  CeedScalar rC[NB];
  mul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

  // write C
  write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
}
175 
176 ////////////////////////////////////////////////////////////////////////////////
// Interpolation kernel, "notrans" direction.  Thin dispatch wrapper: selects
// the multi-component device routine, or the single-component fast path when
// the interpolation output has exactly one component.  All compile-time sizes
// (BASIS_P, BASIS_Q, BASIS_Q_COMP_INTERP, BASIS_NB_INTERP_N) come from the
// JIT-defined macros.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
    void magma_interp_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_INTERP != 1
  magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
187 
188 ////////////////////////////////////////////////////////////////////////////////
// Interpolation kernel, "trans" direction.  Thin dispatch wrapper: selects
// the multi-component device routine, or the single-component fast path when
// the interpolation has exactly one component.  All compile-time sizes
// (BASIS_P, BASIS_Q, BASIS_Q_COMP_INTERP, BASIS_NB_INTERP_T) come from the
// JIT-defined macros.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_interp_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_INTERP != 1
  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_t1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
199 
200 ////////////////////////////////////////////////////////////////////////////////
// Derivative kernel, "notrans" direction.  Thin dispatch wrapper: selects
// the multi-component device routine, or the single-component fast path when
// the derivative output has exactly one component.  All compile-time sizes
// (BASIS_P, BASIS_Q, BASIS_Q_COMP_DERIV, BASIS_NB_DERIV_N) come from the
// JIT-defined macros.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
    void magma_deriv_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_DERIV != 1
  magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
211 
212 ////////////////////////////////////////////////////////////////////////////////
// Derivative kernel, "trans" direction.  Thin dispatch wrapper: selects
// the multi-component device routine, or the single-component fast path when
// the derivative has exactly one component.  All compile-time sizes
// (BASIS_P, BASIS_Q, BASIS_Q_COMP_DERIV, BASIS_NB_DERIV_T) come from the
// JIT-defined macros.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_deriv_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

#if BASIS_Q_COMP_DERIV != 1
  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#else
  magma_basis_nontensor_device_t1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
#endif
}
223 
224 #endif  // CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H
225