xref: /libCEED/include/ceed/jit-source/hip/hip-gen-templates.h (revision cc3bdf8cd7a97c52371610b7fbb458a86c2b0cc9)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for HIP backend macro and type definitions for JiT source
10 #include <ceed/types.h>
11 
12 //------------------------------------------------------------------------------
13 // Load matrices for basis actions
14 //------------------------------------------------------------------------------
15 template <int P, int Q>
16 inline __device__ void LoadMatrix(SharedData_Hip &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
17   for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i];
18 }
19 
20 //------------------------------------------------------------------------------
21 // AtPoints
22 //------------------------------------------------------------------------------
23 
24 //------------------------------------------------------------------------------
25 // L-vector -> single point
26 //------------------------------------------------------------------------------
27 template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
28 inline __device__ void ReadPoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
29                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
30   const CeedInt ind = indices[p + elem * NUM_PTS];
31 
32   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
33     r_u[comp] = d_u[ind + comp * COMP_STRIDE];
34   }
35 }
36 
37 //------------------------------------------------------------------------------
38 // Single point -> L-vector
39 //------------------------------------------------------------------------------
40 template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
41 inline __device__ void WritePoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
42                                   const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) {
43   if (p < points_in_elem) {
44     const CeedInt ind = indices[p + elem * NUM_PTS];
45 
46     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
47       d_u[ind + comp * COMP_STRIDE] += r_u[comp];
48     }
49   }
50 }
51 
52 //------------------------------------------------------------------------------
53 // 1D
54 //------------------------------------------------------------------------------
55 
56 //------------------------------------------------------------------------------
57 // L-vector -> E-vector, offsets provided
58 //------------------------------------------------------------------------------
59 template <int NUM_COMP, int COMP_STRIDE, int P_1d>
60 inline __device__ void ReadLVecStandard1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
61                                           const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
62   if (data.t_id_x < P_1d) {
63     const CeedInt node = data.t_id_x;
64     const CeedInt ind  = indices[node + elem * P_1d];
65 
66     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
67   }
68 }
69 
70 //------------------------------------------------------------------------------
71 // L-vector -> E-vector, strided
72 //------------------------------------------------------------------------------
73 template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
74 inline __device__ void ReadLVecStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
75   if (data.t_id_x < P_1d) {
76     const CeedInt node = data.t_id_x;
77     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
78 
79     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
80   }
81 }
82 
83 //------------------------------------------------------------------------------
84 // E-vector -> L-vector, offsets provided
85 //------------------------------------------------------------------------------
86 template <int NUM_COMP, int COMP_STRIDE, int P_1d>
87 inline __device__ void WriteLVecStandard1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
88                                            const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
89   if (data.t_id_x < P_1d) {
90     const CeedInt node = data.t_id_x;
91     const CeedInt ind  = indices[node + elem * P_1d];
92 
93     for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
94   }
95 }
96 
97 //------------------------------------------------------------------------------
98 // E-vector -> L-vector, strided
99 //------------------------------------------------------------------------------
100 template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
101 inline __device__ void WriteLVecStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
102                                           CeedScalar *__restrict__ d_v) {
103   if (data.t_id_x < P_1d) {
104     const CeedInt node = data.t_id_x;
105     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
106 
107     for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp];
108   }
109 }
110 
111 //------------------------------------------------------------------------------
112 // 2D
113 //------------------------------------------------------------------------------
114 
115 //------------------------------------------------------------------------------
116 // L-vector -> E-vector, offsets provided
117 //------------------------------------------------------------------------------
118 template <int NUM_COMP, int COMP_STRIDE, int P_1d>
119 inline __device__ void ReadLVecStandard2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
120                                           const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
121   if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
122     const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
123     const CeedInt ind  = indices[node + elem * P_1d * P_1d];
124 
125     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
126   }
127 }
128 
129 //------------------------------------------------------------------------------
130 // L-vector -> E-vector, strided
131 //------------------------------------------------------------------------------
132 template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
133 inline __device__ void ReadLVecStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
134   if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
135     const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
136     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
137 
138     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
139   }
140 }
141 
142 //------------------------------------------------------------------------------
143 // E-vector -> L-vector, offsets provided
144 //------------------------------------------------------------------------------
145 template <int NUM_COMP, int COMP_STRIDE, int P_1d>
146 inline __device__ void WriteLVecStandard2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
147                                            const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
148   if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
149     const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
150     const CeedInt ind  = indices[node + elem * P_1d * P_1d];
151 
152     for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
153   }
154 }
155 
156 //------------------------------------------------------------------------------
157 // E-vector -> L-vector, strided
158 //------------------------------------------------------------------------------
159 template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
160 inline __device__ void WriteLVecStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
161                                           CeedScalar *__restrict__ d_v) {
162   if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
163     const CeedInt node = data.t_id_x + data.t_id_y * P_1d;
164     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
165 
166     for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp];
167   }
168 }
169 
170 //------------------------------------------------------------------------------
171 // 3D
172 //------------------------------------------------------------------------------
173 
174 //------------------------------------------------------------------------------
175 // L-vector -> E-vector, offsets provided
176 //------------------------------------------------------------------------------
177 template <int NUM_COMP, int COMP_STRIDE, int P_1d>
178 inline __device__ void ReadLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
179                                           const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
180   if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
181     for (CeedInt z = 0; z < P_1d; z++) {
182       const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
183       const CeedInt ind  = indices[node + elem * P_1d * P_1d * P_1d];
184 
185       for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[ind + COMP_STRIDE * comp];
186     }
187   }
188 }
189 
190 //------------------------------------------------------------------------------
191 // L-vector -> E-vector, strided
192 //------------------------------------------------------------------------------
193 template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
194 inline __device__ void ReadLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
195   if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
196     for (CeedInt z = 0; z < P_1d; z++) {
197       const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
198       const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
199 
200       for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[ind + comp * STRIDES_COMP];
201     }
202   }
203 }
204 
205 //------------------------------------------------------------------------------
206 // E-vector -> Q-vector, offests provided
207 //------------------------------------------------------------------------------
208 template <int NUM_COMP, int COMP_STRIDE, int Q_1d>
209 inline __device__ void ReadEVecSliceStandard3d(SharedData_Hip &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
210                                                const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u,
211                                                CeedScalar *__restrict__ r_u) {
212   if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
213     const CeedInt node = data.t_id_x + data.t_id_y * Q_1d + q * Q_1d * Q_1d;
214     const CeedInt ind  = indices[node + elem * Q_1d * Q_1d * Q_1d];
215 
216     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
217   }
218 }
219 
220 //------------------------------------------------------------------------------
221 // E-vector -> Q-vector, strided
222 //------------------------------------------------------------------------------
223 template <int NUM_COMP, int Q_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
224 inline __device__ void ReadEVecSliceStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
225                                               CeedScalar *__restrict__ r_u) {
226   if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
227     const CeedInt node = data.t_id_x + data.t_id_y * Q_1d + q * Q_1d * Q_1d;
228     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
229 
230     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
231   }
232 }
233 
234 //------------------------------------------------------------------------------
235 // E-vector -> L-vector, offsets provided
236 //------------------------------------------------------------------------------
237 template <int NUM_COMP, int COMP_STRIDE, int P_1d>
238 inline __device__ void WriteLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
239                                            const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
240   if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
241     for (CeedInt z = 0; z < P_1d; z++) {
242       const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
243       const CeedInt ind  = indices[node + elem * P_1d * P_1d * P_1d];
244 
245       for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1d]);
246     }
247   }
248 }
249 
250 //------------------------------------------------------------------------------
251 // E-vector -> L-vector, strided
252 //------------------------------------------------------------------------------
253 template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
254 inline __device__ void WriteLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
255                                           CeedScalar *__restrict__ d_v) {
256   if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
257     for (CeedInt z = 0; z < P_1d; z++) {
258       const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
259       const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
260 
261       for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1d];
262     }
263   }
264 }
265 
266 //------------------------------------------------------------------------------
267 // 3D collocated derivatives computation
268 //------------------------------------------------------------------------------
269 template <int NUM_COMP, int Q_1d>
270 inline __device__ void GradColloSlice3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
271                                         CeedScalar *__restrict__ r_V) {
272   if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
273     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
274       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1d];
275       __syncthreads();
276       // X derivative
277       r_V[comp + 0 * NUM_COMP] = 0.0;
278       for (CeedInt i = 0; i < Q_1d; i++) {
279         r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1d] * data.slice[i + data.t_id_y * T_1D];
280       }
281       // Y derivative
282       r_V[comp + 1 * NUM_COMP] = 0.0;
283       for (CeedInt i = 0; i < Q_1d; i++) {
284         r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1d] * data.slice[data.t_id_x + i * T_1D];
285       }
286       // Z derivative
287       r_V[comp + 2 * NUM_COMP] = 0.0;
288       for (CeedInt i = 0; i < Q_1d; i++) {
289         r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1d] * r_U[i + comp * Q_1d];
290       }
291       __syncthreads();
292     }
293   }
294 }
295 
296 //------------------------------------------------------------------------------
297 // 3D collocated derivatives transpose
298 //------------------------------------------------------------------------------
299 template <int NUM_COMP, int Q_1d>
300 inline __device__ void GradColloSliceTranspose3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
301                                                  CeedScalar *__restrict__ r_V) {
302   if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
303     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
304       // X derivative
305       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
306       __syncthreads();
307       for (CeedInt i = 0; i < Q_1d; i++) {
308         r_V[q + comp * Q_1d] += c_G[data.t_id_x + i * Q_1d] * data.slice[i + data.t_id_y * T_1D];
309       }
310       __syncthreads();
311       // Y derivative
312       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
313       __syncthreads();
314       for (CeedInt i = 0; i < Q_1d; i++) {
315         r_V[q + comp * Q_1d] += c_G[data.t_id_y + i * Q_1d] * data.slice[data.t_id_x + i * T_1D];
316       }
317       __syncthreads();
318       // Z derivative
319       for (CeedInt i = 0; i < Q_1d; i++) {
320         r_V[i + comp * Q_1d] += c_G[i + q * Q_1d] * r_U[comp + 2 * NUM_COMP];
321       }
322     }
323   }
324 }
325