xref: /libCEED/include/ceed/jit-source/hip/hip-gen-templates.h (revision 0183ed61035d97ff853cf8c8e722c0fda76e54df)
1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for HIP backend macro and type definitions for JiT source
10 #include <ceed/types.h>
11 
12 //------------------------------------------------------------------------------
13 // Load matrices for basis actions
14 //------------------------------------------------------------------------------
template <int P, int Q>
inline __device__ void LoadMatrix(SharedData_Hip &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
  // Cooperatively copy the P x Q basis matrix d_B into B, with all threads in the
  // thread block striding over the P * Q entries together.
  const CeedInt num_threads = blockDim.x * blockDim.y * blockDim.z;

  for (CeedInt k = data.t_id; k < P * Q; k += num_threads) {
    B[k] = d_B[k];
  }
}
19 
20 //------------------------------------------------------------------------------
21 // AtPoints
22 //------------------------------------------------------------------------------
23 
24 //------------------------------------------------------------------------------
25 // L-vector -> single point
26 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
inline __device__ void ReadPoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
                                 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Gather all component values for point p of element elem into registers r_u.
  // NOTE(review): points_in_elem is unused here (no bounds guard, unlike WritePoint);
  // presumably padding points carry valid duplicate indices -- confirm with callers.
  const CeedInt ind = indices[elem * NUM_PTS + p];

  for (CeedInt c = 0; c < NUM_COMP; c++) r_u[c] = d_u[ind + c * COMP_STRIDE];
}
36 
37 //------------------------------------------------------------------------------
38 // Single point -> L-vector
39 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
inline __device__ void WritePoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) {
  // Scatter-add register values for point p of element elem back to the L-vector,
  // skipping padding points beyond points_in_elem.
  if (p >= points_in_elem) return;

  const CeedInt ind = indices[elem * NUM_PTS + p];

  for (CeedInt c = 0; c < NUM_COMP; c++) {
    // Plain (non-atomic) accumulate -- assumes each point is owned by one thread; TODO confirm
    d_u[ind + c * COMP_STRIDE] += r_u[c];
  }
}
51 
52 //------------------------------------------------------------------------------
53 // 1D
54 //------------------------------------------------------------------------------
55 
56 //------------------------------------------------------------------------------
57 // Set E-vector value
58 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D>
inline __device__ void SetEVecStandard1d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
  // Set the single E-vector entry with flattened index n (component-major over P_1D nodes).
  const CeedInt comp = n / P_1D;
  const CeedInt node = n % P_1D;

  // Only the thread that owns this node writes
  if (data.t_id_x == node) r_v[comp] = value;
}
68 
69 //------------------------------------------------------------------------------
70 // L-vector -> E-vector, offsets provided
71 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void ReadLVecStandard1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Gather one node per active thread from the L-vector via the element offset map.
  if (data.t_id_x >= P_1D) return;

  const CeedInt ind = indices[data.t_id_x + elem * P_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[ind + COMP_STRIDE * comp];
  }
}
82 
83 //------------------------------------------------------------------------------
84 // L-vector -> E-vector, strided
85 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Gather one node per active thread from a strided (offset-free) L-vector.
  if (data.t_id_x >= P_1D) return;

  const CeedInt base = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[base + comp * STRIDES_COMP];
}
95 
96 //------------------------------------------------------------------------------
97 // E-vector -> L-vector, offsets provided
98 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Scatter-add one node per active thread into the L-vector; atomics guard against
  // other elements writing the same shared node.
  if (data.t_id_x >= P_1D) return;

  const CeedInt ind = indices[data.t_id_x + elem * P_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
  }
}
109 
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard1d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
                                                  CeedScalar *__restrict__ d_v) {
  // Scatter-add the single E-vector entry with flattened index n into the L-vector.
  const CeedInt comp = n / P_1D;
  const CeedInt node = n % P_1D;

  // Only the thread that owns this node participates
  if (data.t_id_x != node) return;

  const CeedInt ind = indices[node + elem * P_1D];

  atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
}
123 
124 //------------------------------------------------------------------------------
125 // E-vector -> L-vector, strided
126 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  // Scatter-add one node per active thread into a strided L-vector; strided layouts
  // give each (node, comp, elem) a unique slot, so no atomics are required.
  if (data.t_id_x >= P_1D) return;

  const CeedInt base = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[base + comp * STRIDES_COMP] += r_v[comp];
}
137 
138 //------------------------------------------------------------------------------
139 // 2D
140 //------------------------------------------------------------------------------
141 
142 //------------------------------------------------------------------------------
143 // Set E-vector value
144 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D>
inline __device__ void SetEVecStandard2d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
  // Set the single E-vector entry with flattened index n, decomposed as
  // (comp, node_y, node_x) over the P_1D x P_1D tensor-product nodes.
  const CeedInt nodes_per_comp = P_1D * P_1D;
  const CeedInt comp           = n / nodes_per_comp;
  const CeedInt node_x         = n % P_1D;
  const CeedInt node_y         = (n % nodes_per_comp) / P_1D;

  // Only the (x, y) owner thread writes
  if (data.t_id_x == node_x && data.t_id_y == node_y) r_v[comp] = value;
}
155 
156 //------------------------------------------------------------------------------
157 // L-vector -> E-vector, offsets provided
158 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void ReadLVecStandard2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Gather one tensor-product node per active (x, y) thread via the offset map.
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
  const CeedInt ind  = indices[node + elem * P_1D * P_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[ind + COMP_STRIDE * comp];
  }
}
169 
170 //------------------------------------------------------------------------------
171 // L-vector -> E-vector, strided
172 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Gather one tensor-product node per active (x, y) thread from a strided L-vector.
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
  const CeedInt base = node * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[base + comp * STRIDES_COMP];
}
182 
183 //------------------------------------------------------------------------------
184 // E-vector -> L-vector, offsets provided
185 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Scatter-add one tensor-product node per active (x, y) thread; atomics guard
  // against other elements writing the same shared node.
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
  const CeedInt ind  = indices[node + elem * P_1D * P_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
  }
}
196 
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard2d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
                                                  CeedScalar *__restrict__ d_v) {
  // Scatter-add the single E-vector entry with flattened index n, decomposed as
  // (comp, node_y, node_x) over the P_1D x P_1D tensor-product nodes.
  const CeedInt nodes_per_comp = P_1D * P_1D;
  const CeedInt comp           = n / nodes_per_comp;
  const CeedInt node_x         = n % P_1D;
  const CeedInt node_y         = (n % nodes_per_comp) / P_1D;

  // Only the (x, y) owner thread participates
  if (data.t_id_x != node_x || data.t_id_y != node_y) return;

  const CeedInt ind = indices[node_x + node_y * P_1D + elem * nodes_per_comp];

  atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
}
212 
213 //------------------------------------------------------------------------------
214 // E-vector -> L-vector, strided
215 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  // Scatter-add one tensor-product node per active (x, y) thread into a strided
  // L-vector; strided layouts give unique slots, so no atomics are required.
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
  const CeedInt base = node * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[base + comp * STRIDES_COMP] += r_v[comp];
}
226 
227 //------------------------------------------------------------------------------
228 // 3D
229 //------------------------------------------------------------------------------
230 
231 //------------------------------------------------------------------------------
232 // Set E-vector value
233 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D>
inline __device__ void SetEVecStandard3d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
  // Set the single E-vector entry with flattened index n, decomposed as
  // (comp, node_z, node_y, node_x) over the P_1D^3 tensor-product nodes.
  const CeedInt nodes_per_comp = P_1D * P_1D * P_1D;
  const CeedInt comp           = n / nodes_per_comp;
  const CeedInt local          = n % nodes_per_comp;
  const CeedInt node_x         = local % P_1D;
  const CeedInt node_y         = (local / P_1D) % P_1D;
  const CeedInt node_z         = local / (P_1D * P_1D);

  // Each thread holds a full z-line in registers; only the (x, y) owner writes
  if (data.t_id_x == node_x && data.t_id_y == node_y) r_v[node_z + comp * P_1D] = value;
}
245 
246 //------------------------------------------------------------------------------
247 // L-vector -> E-vector, offsets provided
248 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void ReadLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Gather a full z-line of nodes per active (x, y) thread via the offset map;
  // registers are laid out as r_u[z + comp * P_1D].
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt elem_offset = elem * P_1D * P_1D * P_1D;

  for (CeedInt z = 0; z < P_1D; z++) {
    const CeedInt node = data.t_id_x + (data.t_id_y + z * P_1D) * P_1D;
    const CeedInt ind  = indices[node + elem_offset];

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + COMP_STRIDE * comp];
  }
}
261 
262 //------------------------------------------------------------------------------
263 // L-vector -> E-vector, strided
264 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Gather a full z-line of nodes per active (x, y) thread from a strided L-vector;
  // registers are laid out as r_u[z + comp * P_1D].
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  for (CeedInt z = 0; z < P_1D; z++) {
    const CeedInt node = data.t_id_x + (data.t_id_y + z * P_1D) * P_1D;
    const CeedInt base = node * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[base + comp * STRIDES_COMP];
  }
}
276 
277 //------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
279 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int Q_1D>
inline __device__ void ReadEVecSliceStandard3d(SharedData_Hip &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
                                               const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u,
                                               CeedScalar *__restrict__ r_u) {
  // Gather one quadrature point per active (x, y) thread from the z-slice q of
  // element elem, via the offset map.
  if (data.t_id_x >= Q_1D || data.t_id_y >= Q_1D) return;

  const CeedInt node = data.t_id_x + (data.t_id_y + q * Q_1D) * Q_1D;
  const CeedInt ind  = indices[node + elem * Q_1D * Q_1D * Q_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
}
291 
292 //------------------------------------------------------------------------------
293 // E-vector -> Q-vector, strided
294 //------------------------------------------------------------------------------
template <int NUM_COMP, int Q_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadEVecSliceStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
                                              CeedScalar *__restrict__ r_u) {
  // Gather one quadrature point per active (x, y) thread from the z-slice q of a
  // strided E-vector.
  if (data.t_id_x >= Q_1D || data.t_id_y >= Q_1D) return;

  const CeedInt node = data.t_id_x + (data.t_id_y + q * Q_1D) * Q_1D;
  const CeedInt base = node * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[base + comp * STRIDES_COMP];
}
305 
306 //------------------------------------------------------------------------------
307 // E-vector -> L-vector, offsets provided
308 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Scatter-add a full z-line of nodes per active (x, y) thread into the L-vector;
  // registers are laid out as r_v[z + comp * P_1D], atomics guard shared nodes.
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt elem_offset = elem * P_1D * P_1D * P_1D;

  for (CeedInt z = 0; z < P_1D; z++) {
    const CeedInt node = data.t_id_x + (data.t_id_y + z * P_1D) * P_1D;
    const CeedInt ind  = indices[node + elem_offset];

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1D]);
  }
}
321 
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard3d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
                                                  CeedScalar *__restrict__ d_v) {
  // Scatter-add the single E-vector entry with flattened index n, decomposed as
  // (comp, node_z, node_y, node_x) over the P_1D^3 tensor-product nodes.
  const CeedInt nodes_per_comp = P_1D * P_1D * P_1D;
  const CeedInt comp           = n / nodes_per_comp;
  const CeedInt local          = n % nodes_per_comp;
  const CeedInt node_x         = local % P_1D;
  const CeedInt node_y         = (local / P_1D) % P_1D;
  const CeedInt node_z         = local / (P_1D * P_1D);

  // Only the (x, y) owner thread participates; it holds the z-line in registers
  if (data.t_id_x != node_x || data.t_id_y != node_y) return;

  const CeedInt ind = indices[node_x + (node_y + node_z * P_1D) * P_1D + elem * nodes_per_comp];

  atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[node_z + comp * P_1D]);
}
338 
339 //------------------------------------------------------------------------------
340 // E-vector -> L-vector, strided
341 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  // Scatter-add a full z-line of nodes per active (x, y) thread into a strided
  // L-vector; strided layouts give unique slots, so no atomics are required.
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  for (CeedInt z = 0; z < P_1D; z++) {
    const CeedInt node = data.t_id_x + (data.t_id_y + z * P_1D) * P_1D;
    const CeedInt base = node * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[base + comp * STRIDES_COMP] += r_v[z + comp * P_1D];
  }
}
354 
355 //------------------------------------------------------------------------------
356 // 3D collocated derivatives computation
357 //------------------------------------------------------------------------------
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void GradColloSlice3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                        CeedScalar *__restrict__ r_V) {
  // Compute collocated x/y/z derivatives at quadrature slice q.
  // Inputs:  r_U holds NUM_COMP z-lines of Q_1D values per thread (r_U[z + comp * Q_1D]);
  //          c_G is the Q_1D x Q_1D collocated gradient matrix.
  // Output:  r_V[comp + dim * NUM_COMP] receives the dim-direction derivative per component.
  // Scratch: data.slice is a T_1D x T_1D tile shared across the thread block.
  // NOTE(review): the __syncthreads() calls sit inside a branch that is divergent whenever
  // blockDim exceeds Q_1D in x or y -- presumably the launch configuration keeps all
  // wavefronts converged here; confirm against the kernel launcher.
  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      // Publish this component's slice-q value to shared memory; barriers protect the
      // tile against the previous iteration's readers and this iteration's writers.
      __syncthreads();
      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1D];
      __syncthreads();
      // X derivative: contract c_G along the x-row of the shared slice
      r_V[comp + 0 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1D] * data.slice[i + data.t_id_y * T_1D];
      }
      // Y derivative: contract c_G along the y-column of the shared slice
      r_V[comp + 1 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1D] * data.slice[data.t_id_x + i * T_1D];
      }
      // Z derivative: contract along the register-resident z-line (no shared memory needed)
      r_V[comp + 2 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1D] * r_U[i + comp * Q_1D];
      }
    }
  }
}
384 
385 //------------------------------------------------------------------------------
386 // 3D collocated derivatives transpose
387 //------------------------------------------------------------------------------
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void GradColloSliceTranspose3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                                 CeedScalar *__restrict__ r_V) {
  // Transpose (adjoint) of GradColloSlice3d: accumulate the x/y/z derivative
  // contributions at quadrature slice q back into the z-line registers.
  // Inputs:  r_U[comp + dim * NUM_COMP] holds the dim-direction derivative per component;
  //          c_G is the Q_1D x Q_1D collocated gradient matrix (applied transposed here).
  // Output:  r_V[z + comp * Q_1D] is accumulated into (+=), not overwritten.
  // Scratch: data.slice is a T_1D x T_1D tile shared across the thread block.
  // NOTE(review): as in GradColloSlice3d, __syncthreads() sits inside a potentially
  // divergent branch -- presumably the launch keeps all wavefronts converged; confirm.
  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      // X derivative: publish the x-contribution, then contract with c_G transposed
      __syncthreads();
      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
      __syncthreads();
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[q + comp * Q_1D] += c_G[data.t_id_x + i * Q_1D] * data.slice[i + data.t_id_y * T_1D];
      }
      // Y derivative: publish the y-contribution, then contract with c_G transposed
      __syncthreads();
      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
      __syncthreads();
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[q + comp * Q_1D] += c_G[data.t_id_y + i * Q_1D] * data.slice[data.t_id_x + i * T_1D];
      }
      // Z derivative: spread slice q's z-contribution over the register z-line
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[i + comp * Q_1D] += c_G[i + q * Q_1D] * r_U[comp + 2 * NUM_COMP];
      }
    }
  }
}
414