xref: /libCEED/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h (revision 22a2f50b669dfdc8ebd97c8d1809aa5206a4c587) !
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for MAGMA non-tensor basis interpolation
10 #ifndef CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H
11 #define CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H
12 
13 #include "magma-common-nontensor.h"
14 
15 ////////////////////////////////////////////////////////////////////////////////
// Non-tensor "notrans" apply for Q_COMP > 1 components: for each component d,
// computes a Q x NB slab of C_d from the P x Q matrix A_d and a P x NB slab of B.
// Thread layout: tx spans the rows of C, ty selects the NB-column slab owned by
// this thread group. Shared memory: one P x NB slab of B per ty, followed by a
// staging area for A shared by the whole thread block.
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global slab index
  const int nblocks = (n + NB - 1) / NB;             // ceil(n / NB) slabs in total
  const int myn     = min(NB, n - id * NB);          // columns in this slab (tail slab may be short)

  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q
  CeedScalar *sB = shared_data + ty * P * NB;          // per-ty slab of B
  CeedScalar *sA = shared_data + blockDim.y * P * NB;  // A staging, shared by all ty (disjoint from sB)

  // read B once for all C's
  if (id < nblocks) {
    read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  }

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // init rA, rC
    CeedScalar rA[P], rC[NB];

    // read A using all threads
    read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
    // BUGFIX: barrier before consuming shared memory. The "_nosync" helpers leave
    // synchronization to the caller; without this barrier the sB slab staged above
    // (and the sA staging traffic) is not guaranteed visible to the threads that
    // read it in mul_rAsBrC below. The sibling routines (_n1, _t) synchronize at
    // exactly this point.
    __syncthreads();

    mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

    // write C
    if (id < nblocks) {
      write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
    }

    // advance to the next component of A and the next component block of C
    dA += Q * P;
    dC += Q * n;

    // keep iterations from racing on sA/sB
    __syncthreads();
  }
}
58 
59 ////////////////////////////////////////////////////////////////////////////////
// Single-component (Q_COMP == 1) specialization of the "notrans" apply:
// computes one Q x NB slab of C from the P x Q matrix A and a P x NB slab of B.
// NOTE(review): sA (at shared_data) and the per-ty sB slabs overlap (sB == sA for
// ty == 0), so the first barrier orders the A staging before B overwrites that
// region — presumably a deliberate shared-memory footprint reduction; confirm
// against the helpers in magma-common-nontensor.h.
template <typename T, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_n1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                  CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global NB-column slab index
  const int nblocks = (n + NB - 1) / NB;             // ceil(n / NB) slabs in total
  const int myn     = min(NB, n - id * NB);          // columns in this slab (tail slab may be short)

  dB += id * P * NB;
  dC += id * Q * NB;

  // A is P x Q
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * P * NB;

  // init rA, rC
  CeedScalar rA[P], rC[NB];

  // read A using all threads
  read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
  __syncthreads();  // A staging must finish before sB (overlapping region) is written

  // terminate threads with no work
  // NOTE(review): exiting threads skip the barrier below; relies on the remaining
  // threads forming the only participants — verify against target-architecture
  // barrier semantics.
  if (id >= nblocks) return;

  // read B
  read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
  __syncthreads();  // make the staged B slab visible before it is multiplied

  mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);

  // write C
  write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
}
95 
96 ////////////////////////////////////////////////////////////////////////////////
// Non-tensor "transpose" apply: accumulates a P x NB slab of C as the sum over
// the Q_COMP components d of A_d applied (untransposed read) to the matching
// Q x NB slab of B_d. rC carries the running sum across the component loop.
template <typename T, int Q_COMP, int P, int Q, int NB>
static __device__ __inline__ void magma_basis_nontensor_device_t(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
                                                                 CeedScalar *shared_data) {
  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;  // global NB-column slab index
  const int nblocks = (n + NB - 1) / NB;             // ceil(n / NB) slabs in total
  const int myn     = min(NB, n - id * NB);          // columns in this slab (tail slab may be short)

  dB += id * Q * NB;
  dC += id * P * NB;

  // A is P x Q
  // NOTE(review): sA and the per-ty sB slabs overlap (sB == sA for ty == 0); the
  // barriers below order A staging before the B staging that reuses the region.
  CeedScalar *sA = shared_data;
  CeedScalar *sB = shared_data + ty * Q * NB;

  // init rC — accumulator, zeroed once and summed into across all components
  CeedScalar rC[NB] = {0.0};

  // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
  for (int d = 0; d < Q_COMP; d++) {
    // init rA
    CeedScalar rA[Q];

    // read A using all threads
    read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, MAGMA_BASIS_NTCOL(P, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
    __syncthreads();  // A staging must finish before sB (overlapping region) is written

    // read B
    if (id < nblocks) {
      read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
    }
    __syncthreads();  // make the staged B slab visible before it is multiplied

    // rC += rA * sB for this component
    addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);

    // advance to the next component of A and the next component block of B
    dA += P * Q;
    dB += Q * n;

    __syncthreads();  // keep iterations from racing on sA/sB
  }

  // write C — single write of the accumulated result
  if (id < nblocks) {
    write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
  }
}
144 
145 ////////////////////////////////////////////////////////////////////////////////
// Interpolation, "notrans" direction. Dispatches to the multi-component routine
// unless the basis has exactly one interp component, which has a leaner path.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
    void magma_interp_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
  CeedScalar *sdata = (CeedScalar *)shared_data;

  if (BASIS_Q_COMP_INTERP != 1) {
    magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, sdata);
  } else {
    magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, sdata);
  }
}
156 
157 ////////////////////////////////////////////////////////////////////////////////
// Interpolation, "transpose" direction. Always uses the multi-component routine,
// whose component loop also covers BASIS_Q_COMP_INTERP == 1.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_interp_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
  CeedScalar *sdata = (CeedScalar *)shared_data;

  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, sdata);
}
164 
165 ////////////////////////////////////////////////////////////////////////////////
// Derivative, "notrans" direction. Dispatches to the multi-component routine
// unless the basis has exactly one derivative component, which has a leaner path.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
    void magma_deriv_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
  CeedScalar *sdata = (CeedScalar *)shared_data;

  if (BASIS_Q_COMP_DERIV != 1) {
    magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, sdata);
  } else {
    magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, sdata);
  }
}
176 
177 ////////////////////////////////////////////////////////////////////////////////
// Derivative, "transpose" direction. Always uses the multi-component routine,
// whose component loop also covers BASIS_Q_COMP_DERIV == 1.
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_deriv_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
  CeedScalar *sdata = (CeedScalar *)shared_data;

  magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, sdata);
}
184 
185 #endif  // CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H
186