xref: /libCEED/include/ceed/jit-source/magma/magma-basis-interp-deriv-nontensor.h (revision 9d15e85b4f78ffb2d2860753c87a3b1789cc3bb6)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for MAGMA non-tensor basis interpolation
10 #ifndef CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H
11 #define CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H
12 
13 #include "magma-common-nontensor.h"
14 
15 ////////////////////////////////////////////////////////////////////////////////
16 template <typename T, int Q_COMP, int P, int Q, int NB>
17 static __device__ __inline__ void magma_basis_nontensor_device_n(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
18                                                                  CeedScalar *shared_data) {
19   const int tx      = threadIdx.x;
20   const int ty      = threadIdx.y;
21   const int id      = blockIdx.x * blockDim.y + ty;
22   const int nblocks = (n + NB - 1) / NB;
23   const int myn     = min(NB, n - id * NB);
24 
25   dB += id * P * NB;
26   dC += id * Q * NB;
27 
28   // A is P x Q
29   CeedScalar *sB = shared_data + ty * P * NB;
30   CeedScalar *sA = shared_data + blockDim.y * P * NB;
31 
32   // read B once for all C's
33   if (id < nblocks) {
34     read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
35   }
36 
37   // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
38   for (int d = 0; d < Q_COMP; d++) {
39     // init rA, rC
40     CeedScalar rA[P], rC[NB];
41 
42     // read A using all threads
43     read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
44 
45     if (id < nblocks) {
46       mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);
47 
48       // write C
49       write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
50     }
51 
52     dA += Q * P;
53     dC += Q * n;
54 
55     __syncthreads();
56   }
57 }
58 
59 ////////////////////////////////////////////////////////////////////////////////
60 template <typename T, int P, int Q, int NB>
61 static __device__ __inline__ void magma_basis_nontensor_device_n1(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
62                                                                   CeedScalar *shared_data) {
63   const int tx      = threadIdx.x;
64   const int ty      = threadIdx.y;
65   const int id      = blockIdx.x * blockDim.y + ty;
66   const int nblocks = (n + NB - 1) / NB;
67   const int myn     = min(NB, n - id * NB);
68 
69   dB += id * P * NB;
70   dC += id * Q * NB;
71 
72   // A is P x Q
73   CeedScalar *sA = shared_data;
74   CeedScalar *sB = shared_data + ty * P * NB;
75 
76   // init rA, rC
77   CeedScalar rA[P], rC[NB];
78 
79   // read A using all threads
80   read_A_trans_g2r_1D_nosync<CeedScalar, Q, P, MAGMA_BASIS_NTCOL(Q, MAGMA_MAXTHREADS_1D)>(tx, ty, dA, sA, rA);
81   __syncthreads();
82 
83   // terminate threads with no work
84   if (id >= nblocks) return;
85 
86   // read B once for all C's
87   read_B_g2s_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, dB, sB);
88   __syncthreads();
89 
90   mul_rAsBrC_1D_nosync<CeedScalar, Q, P, NB>(rA, sB, rC);
91 
92   // write C
93   write_C_r2g_1D_nosync<CeedScalar, Q, P, NB>(tx, myn, rC, dC);
94 }
95 
96 ////////////////////////////////////////////////////////////////////////////////
97 template <typename T, int Q_COMP, int P, int Q, int NB>
98 static __device__ __inline__ void magma_basis_nontensor_device_t(const int n, CeedScalar const *dA, CeedScalar const *dB, CeedScalar *dC,
99                                                                  CeedScalar *shared_data) {
100   const int tx      = threadIdx.x;
101   const int ty      = threadIdx.y;
102   const int id      = blockIdx.x * blockDim.y + ty;
103   const int nblocks = (n + NB - 1) / NB;
104   const int myn     = min(NB, n - id * NB);
105 
106   // terminate threads with no work
107   if (id >= nblocks) return;
108 
109   dB += id * Q * NB;
110   dC += id * P * NB;
111 
112   // A is P x Q
113   CeedScalar *sB = shared_data + ty * Q * NB;
114 
115   // init rC
116   CeedScalar rC[NB] = {0.0};
117 
118   // unrolling this loop yields dramatic performance drop using hipcc, so let the compiler decide (no pragma unroll)
119   for (int d = 0; d < Q_COMP; d++) {
120     // init rA
121     CeedScalar rA[Q];
122 
123     // read A
124     read_A_notrans_g2r_1D_nosync<CeedScalar, P, Q, NB>(tx, dA, rA);
125 
126     // read B
127     read_B_g2s_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, dB, sB);
128     __syncthreads();
129 
130     addmul_rAsBrC_1D_nosync<CeedScalar, P, Q, NB>(rA, sB, rC);
131 
132     dA += P * Q;
133     dB += Q * n;
134 
135     __syncthreads();
136   }
137 
138   // write C
139   write_C_r2g_1D_nosync<CeedScalar, P, Q, NB>(tx, myn, rC, dC);
140 }
141 
142 ////////////////////////////////////////////////////////////////////////////////
143 extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
144     void magma_interp_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
145   MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
146 
147   if (BASIS_Q_COMP_INTERP == 1) {
148     magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
149   } else {
150     magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
151   }
152 }
153 
154 ////////////////////////////////////////////////////////////////////////////////
155 extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
156     void magma_interp_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
157   MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
158 
159   magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_INTERP, BASIS_P, BASIS_Q, BASIS_NB_INTERP_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
160 }
161 
162 ////////////////////////////////////////////////////////////////////////////////
163 extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
164     void magma_deriv_nontensor_n(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
165   MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
166 
167   if (BASIS_Q_COMP_DERIV == 1) {
168     magma_basis_nontensor_device_n1<CeedScalar, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
169   } else {
170     magma_basis_nontensor_device_n<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_N>(n, dA, dB, dC, (CeedScalar *)shared_data);
171   }
172 }
173 
174 ////////////////////////////////////////////////////////////////////////////////
175 extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
176     void magma_deriv_nontensor_t(const int n, CeedScalar const *__restrict__ dA, CeedScalar const *__restrict__ dB, CeedScalar *__restrict__ dC) {
177   MAGMA_DEVICE_SHARED(CeedScalar, shared_data);
178 
179   magma_basis_nontensor_device_t<CeedScalar, BASIS_Q_COMP_DERIV, BASIS_P, BASIS_Q, BASIS_NB_DERIV_T>(n, dA, dB, dC, (CeedScalar *)shared_data);
180 }
181 
182 #endif  // CEED_MAGMA_BASIS_INTERP_DERIV_NONTENSOR_H
183