xref: /libCEED/include/ceed/jit-source/hip/hip-gen-templates.h (revision 07328cdeb7cc6fe1ac9596b69aea7fe9a5beddd3)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for HIP backend macro and type definitions for JiT source
10 #ifndef CEED_HIP_GEN_TEMPLATES_H
11 #define CEED_HIP_GEN_TEMPLATES_H
12 
13 #include <ceed/types.h>
14 
15 //------------------------------------------------------------------------------
16 // Load matrices for basis actions
17 //------------------------------------------------------------------------------
18 template <int P, int Q>
19 inline __device__ void loadMatrix(SharedData_Hip& data, const CeedScalar* d_B, CeedScalar* B) {
20   for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i];
21 }
22 
23 //------------------------------------------------------------------------------
24 // 1D
25 //------------------------------------------------------------------------------
26 
27 //------------------------------------------------------------------------------
28 // L-vector -> E-vector, offsets provided
29 //------------------------------------------------------------------------------
30 template <int NCOMP, int COMPSTRIDE, int P1d>
31 inline __device__ void readDofsOffset1d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u,
32                                         CeedScalar* r_u) {
33   if (data.t_id_x < P1d) {
34     const CeedInt node = data.t_id_x;
35     const CeedInt ind  = indices[node + elem * P1d];
36     for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + COMPSTRIDE * comp];
37   }
38 }
39 
40 //------------------------------------------------------------------------------
41 // L-vector -> E-vector, strided
42 //------------------------------------------------------------------------------
43 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
44 inline __device__ void readDofsStrided1d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
45   if (data.t_id_x < P1d) {
46     const CeedInt node = data.t_id_x;
47     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
48     for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
49   }
50 }
51 
52 //------------------------------------------------------------------------------
53 // E-vector -> L-vector, offsets provided
54 //------------------------------------------------------------------------------
55 template <int NCOMP, int COMPSTRIDE, int P1d>
56 inline __device__ void writeDofsOffset1d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices,
57                                          const CeedScalar* r_v, CeedScalar* d_v) {
58   if (data.t_id_x < P1d) {
59     const CeedInt node = data.t_id_x;
60     const CeedInt ind  = indices[node + elem * P1d];
61     for (CeedInt comp = 0; comp < NCOMP; ++comp) atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
62   }
63 }
64 
65 //------------------------------------------------------------------------------
66 // E-vector -> L-vector, strided
67 //------------------------------------------------------------------------------
68 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
69 inline __device__ void writeDofsStrided1d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
70   if (data.t_id_x < P1d) {
71     const CeedInt node = data.t_id_x;
72     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
73     for (CeedInt comp = 0; comp < NCOMP; ++comp) d_v[ind + comp * STRIDES_COMP] += r_v[comp];
74   }
75 }
76 
77 //------------------------------------------------------------------------------
78 // 2D
79 //------------------------------------------------------------------------------
80 
81 //------------------------------------------------------------------------------
82 // L-vector -> E-vector, offsets provided
83 //------------------------------------------------------------------------------
84 template <int NCOMP, int COMPSTRIDE, int P1d>
85 inline __device__ void readDofsOffset2d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u,
86                                         CeedScalar* r_u) {
87   if (data.t_id_x < P1d && data.t_id_y < P1d) {
88     const CeedInt node = data.t_id_x + data.t_id_y * P1d;
89     const CeedInt ind  = indices[node + elem * P1d * P1d];
90     for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + COMPSTRIDE * comp];
91   }
92 }
93 
94 //------------------------------------------------------------------------------
95 // L-vector -> E-vector, strided
96 //------------------------------------------------------------------------------
97 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
98 inline __device__ void readDofsStrided2d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
99   if (data.t_id_x < P1d && data.t_id_y < P1d) {
100     const CeedInt node = data.t_id_x + data.t_id_y * P1d;
101     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
102     for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
103   }
104 }
105 
106 //------------------------------------------------------------------------------
107 // E-vector -> L-vector, offsets provided
108 //------------------------------------------------------------------------------
109 template <int NCOMP, int COMPSTRIDE, int P1d>
110 inline __device__ void writeDofsOffset2d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices,
111                                          const CeedScalar* r_v, CeedScalar* d_v) {
112   if (data.t_id_x < P1d && data.t_id_y < P1d) {
113     const CeedInt node = data.t_id_x + data.t_id_y * P1d;
114     const CeedInt ind  = indices[node + elem * P1d * P1d];
115     for (CeedInt comp = 0; comp < NCOMP; ++comp) atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
116   }
117 }
118 
119 //------------------------------------------------------------------------------
120 // E-vector -> L-vector, strided
121 //------------------------------------------------------------------------------
122 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
123 inline __device__ void writeDofsStrided2d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
124   if (data.t_id_x < P1d && data.t_id_y < P1d) {
125     const CeedInt node = data.t_id_x + data.t_id_y * P1d;
126     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
127     for (CeedInt comp = 0; comp < NCOMP; ++comp) d_v[ind + comp * STRIDES_COMP] += r_v[comp];
128   }
129 }
130 
131 //------------------------------------------------------------------------------
132 // 3D
133 //------------------------------------------------------------------------------
134 
135 //------------------------------------------------------------------------------
136 // L-vector -> E-vector, offsets provided
137 //------------------------------------------------------------------------------
138 template <int NCOMP, int COMPSTRIDE, int P1d>
139 inline __device__ void readDofsOffset3d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u,
140                                         CeedScalar* r_u) {
141   if (data.t_id_x < P1d && data.t_id_y < P1d)
142     for (CeedInt z = 0; z < P1d; ++z) {
143       const CeedInt node = data.t_id_x + data.t_id_y * P1d + z * P1d * P1d;
144       const CeedInt ind  = indices[node + elem * P1d * P1d * P1d];
145       for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[z + comp * P1d] = d_u[ind + COMPSTRIDE * comp];
146     }
147 }
148 
149 //------------------------------------------------------------------------------
150 // L-vector -> E-vector, strided
151 //------------------------------------------------------------------------------
152 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
153 inline __device__ void readDofsStrided3d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
154   if (data.t_id_x < P1d && data.t_id_y < P1d)
155     for (CeedInt z = 0; z < P1d; ++z) {
156       const CeedInt node = data.t_id_x + data.t_id_y * P1d + z * P1d * P1d;
157       const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
158       for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[z + comp * P1d] = d_u[ind + comp * STRIDES_COMP];
159     }
160 }
161 
162 //------------------------------------------------------------------------------
163 // E-vector -> Q-vector, offests provided
164 //------------------------------------------------------------------------------
165 template <int NCOMP, int COMPSTRIDE, int Q1d>
166 inline __device__ void readSliceQuadsOffset3d(SharedData_Hip& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices,
167                                               const CeedScalar* d_u, CeedScalar* r_u) {
168   if (data.t_id_x < Q1d && data.t_id_y < Q1d) {
169     const CeedInt node = data.t_id_x + data.t_id_y * Q1d + q * Q1d * Q1d;
170     const CeedInt ind  = indices[node + elem * Q1d * Q1d * Q1d];
171     ;
172     for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + COMPSTRIDE * comp];
173   }
174 }
175 
176 //------------------------------------------------------------------------------
177 // E-vector -> Q-vector, strided
178 //------------------------------------------------------------------------------
179 template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
180 inline __device__ void readSliceQuadsStrided3d(SharedData_Hip& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
181   if (data.t_id_x < Q1d && data.t_id_y < Q1d) {
182     const CeedInt node = data.t_id_x + data.t_id_y * Q1d + q * Q1d * Q1d;
183     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
184     for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
185   }
186 }
187 
188 //------------------------------------------------------------------------------
189 // E-vector -> L-vector, offsets provided
190 //------------------------------------------------------------------------------
191 template <int NCOMP, int COMPSTRIDE, int P1d>
192 inline __device__ void writeDofsOffset3d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices,
193                                          const CeedScalar* r_v, CeedScalar* d_v) {
194   if (data.t_id_x < P1d && data.t_id_y < P1d)
195     for (CeedInt z = 0; z < P1d; ++z) {
196       const CeedInt node = data.t_id_x + data.t_id_y * P1d + z * P1d * P1d;
197       const CeedInt ind  = indices[node + elem * P1d * P1d * P1d];
198       for (CeedInt comp = 0; comp < NCOMP; ++comp) atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z + comp * P1d]);
199     }
200 }
201 
202 //------------------------------------------------------------------------------
203 // E-vector -> L-vector, strided
204 //------------------------------------------------------------------------------
205 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
206 inline __device__ void writeDofsStrided3d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
207   if (data.t_id_x < P1d && data.t_id_y < P1d)
208     for (CeedInt z = 0; z < P1d; ++z) {
209       const CeedInt node = data.t_id_x + data.t_id_y * P1d + z * P1d * P1d;
210       const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
211       for (CeedInt comp = 0; comp < NCOMP; ++comp) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P1d];
212     }
213 }
214 
215 //------------------------------------------------------------------------------
216 // 3D collocated derivatives computation
217 //------------------------------------------------------------------------------
218 template <int NCOMP, int Q1d>
219 inline __device__ void gradCollo3d(SharedData_Hip& data, const CeedInt q, const CeedScalar* __restrict__ r_U, const CeedScalar* c_G,
220                                    CeedScalar* __restrict__ r_V) {
221   if (data.t_id_x < Q1d && data.t_id_y < Q1d) {
222     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
223       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q1d];
224       __syncthreads();
225       // X derivative
226       r_V[comp + 0 * NCOMP] = 0.0;
227       for (CeedInt i = 0; i < Q1d; ++i)
228         r_V[comp + 0 * NCOMP] += c_G[i + data.t_id_x * Q1d] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction (X derivative)
229       // Y derivative
230       r_V[comp + 1 * NCOMP] = 0.0;
231       for (CeedInt i = 0; i < Q1d; ++i)
232         r_V[comp + 1 * NCOMP] += c_G[i + data.t_id_y * Q1d] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction (Y derivative)
233       // Z derivative
234       r_V[comp + 2 * NCOMP] = 0.0;
235       for (CeedInt i = 0; i < Q1d; ++i) r_V[comp + 2 * NCOMP] += c_G[i + q * Q1d] * r_U[i + comp * Q1d];  // Contract z direction (Z derivative)
236       __syncthreads();
237     }
238   }
239 }
240 
241 //------------------------------------------------------------------------------
242 // 3D collocated derivatives transpose
243 //------------------------------------------------------------------------------
244 template <int NCOMP, int Q1d>
245 inline __device__ void gradColloTranspose3d(SharedData_Hip& data, const CeedInt q, const CeedScalar* __restrict__ r_U, const CeedScalar* c_G,
246                                             CeedScalar* __restrict__ r_V) {
247   if (data.t_id_x < Q1d && data.t_id_y < Q1d) {
248     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
249       // X derivative
250       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NCOMP];
251       __syncthreads();
252       for (CeedInt i = 0; i < Q1d; ++i)
253         r_V[q + comp * Q1d] += c_G[data.t_id_x + i * Q1d] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction (X derivative)
254       __syncthreads();
255       // Y derivative
256       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NCOMP];
257       __syncthreads();
258       for (CeedInt i = 0; i < Q1d; ++i)
259         r_V[q + comp * Q1d] += c_G[data.t_id_y + i * Q1d] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction (Y derivative)
260       __syncthreads();
261       // Z derivative
262       for (CeedInt i = 0; i < Q1d; ++i)
263         r_V[i + comp * Q1d] += c_G[i + q * Q1d] * r_U[comp + 2 * NCOMP];  // PARTIAL contract z direction (Z derivative)
264     }
265   }
266 }
267 
268 #endif  // CEED_HIP_GEN_TEMPLATES_H
269