xref: /libCEED/include/ceed/jit-source/hip/hip-gen-templates.h (revision f36e753100926c940e21f506e95c0b0794520501)
1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for HIP backend macro and type definitions for JiT source
10 #include <ceed/types.h>
11 
12 //------------------------------------------------------------------------------
13 // Load matrices for basis actions
14 //------------------------------------------------------------------------------
15 template <int P, int Q>
16 inline __device__ void LoadMatrix(SharedData_Hip &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
17   for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i];
18 }
19 
20 //------------------------------------------------------------------------------
21 // AtPoints
22 //------------------------------------------------------------------------------
23 
24 //------------------------------------------------------------------------------
25 // L-vector -> single point
26 //------------------------------------------------------------------------------
27 template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
28 inline __device__ void ReadPoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
29                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
30   const CeedInt ind = indices[p + elem * NUM_PTS];
31 
32   for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
33     r_u[comp] = d_u[ind + comp * COMP_STRIDE];
34   }
35 }
36 
37 //------------------------------------------------------------------------------
38 // Single point -> L-vector
39 //------------------------------------------------------------------------------
40 template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
41 inline __device__ void WritePoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
42                                   const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) {
43   if (p < points_in_elem) {
44     const CeedInt ind = indices[p + elem * NUM_PTS];
45 
46     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
47       d_u[ind + comp * COMP_STRIDE] += r_u[comp];
48     }
49   }
50 }
51 
52 //------------------------------------------------------------------------------
53 // 1D
54 //------------------------------------------------------------------------------
55 
56 //------------------------------------------------------------------------------
57 // Set E-vector value
58 //------------------------------------------------------------------------------
59 template <int NUM_COMP, int P_1D>
60 inline __device__ void SetEVecStandard1d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
61   const CeedInt target_comp = n / P_1D;
62   const CeedInt target_node = n % P_1D;
63 
64   if (data.t_id_x == target_node) {
65     r_v[target_comp] = value;
66   }
67 }
68 
69 //------------------------------------------------------------------------------
70 // L-vector -> E-vector, offsets provided
71 //------------------------------------------------------------------------------
72 template <int NUM_COMP, int COMP_STRIDE, int P_1D>
73 inline __device__ void ReadLVecStandard1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
74                                           const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
75   if (data.t_id_x < P_1D) {
76     const CeedInt node = data.t_id_x;
77     const CeedInt ind  = indices[node + elem * P_1D];
78 
79     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
80   }
81 }
82 
83 //------------------------------------------------------------------------------
84 // L-vector -> E-vector, strided
85 //------------------------------------------------------------------------------
86 template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
87 inline __device__ void ReadLVecStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
88   if (data.t_id_x < P_1D) {
89     const CeedInt node = data.t_id_x;
90     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
91 
92     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
93   }
94 }
95 
96 //------------------------------------------------------------------------------
97 // E-vector -> L-vector, offsets provided
98 //------------------------------------------------------------------------------
99 template <int NUM_COMP, int COMP_STRIDE, int P_1D>
100 inline __device__ void WriteLVecStandard1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
101                                            const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
102   if (data.t_id_x < P_1D) {
103     const CeedInt node = data.t_id_x;
104     const CeedInt ind  = indices[node + elem * P_1D];
105 
106     for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
107   }
108 }
109 
110 template <int NUM_COMP, int COMP_STRIDE, int P_1D>
111 inline __device__ void WriteLVecStandard1d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
112                                                   const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
113                                                   CeedScalar *__restrict__ d_v) {
114   const CeedInt target_comp = n / P_1D;
115   const CeedInt target_node = n % P_1D;
116 
117   if (data.t_id_x == target_node) {
118     const CeedInt ind = indices[target_node + elem * P_1D];
119 
120     atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]);
121   }
122 }
123 
124 //------------------------------------------------------------------------------
125 // E-vector -> L-vector, full assembly
126 //------------------------------------------------------------------------------
127 template <int NUM_COMP, int COMP_STRIDE, int P_1D>
128 inline __device__ void WriteLVecStandard1d_Assembly(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
129                                                     const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
130   const CeedInt in_comp    = in / P_1D;
131   const CeedInt in_node    = in % P_1D;
132   const CeedInt e_vec_size = P_1D * NUM_COMP;
133 
134   if (data.t_id_x < P_1D) {
135     const CeedInt out_node = data.t_id_x;
136 
137     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
138       d_v[elem * e_vec_size * e_vec_size + (in_comp * NUM_COMP + comp) * P_1D * P_1D + out_node * P_1D + in_node] += r_v[comp];
139     }
140   }
141 }
142 
143 //------------------------------------------------------------------------------
144 // E-vector -> L-vector, Qfunction assembly
145 //------------------------------------------------------------------------------
146 template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
147 inline __device__ void WriteLVecStandard1d_QFAssembly(SharedData_Hip &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
148                                                       const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
149   if (data.t_id_x < Q_1D) {
150     const CeedInt ind = data.t_id_x + elem * Q_1D;
151 
152     for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
153       d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * num_elem)] = r_v[comp];
154     }
155   }
156 }
157 
158 //------------------------------------------------------------------------------
159 // E-vector -> L-vector, strided
160 //------------------------------------------------------------------------------
161 template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
162 inline __device__ void WriteLVecStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
163                                           CeedScalar *__restrict__ d_v) {
164   if (data.t_id_x < P_1D) {
165     const CeedInt node = data.t_id_x;
166     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
167 
168     for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp];
169   }
170 }
171 
172 //------------------------------------------------------------------------------
173 // 2D
174 //------------------------------------------------------------------------------
175 
176 //------------------------------------------------------------------------------
177 // Set E-vector value
178 //------------------------------------------------------------------------------
179 template <int NUM_COMP, int P_1D>
180 inline __device__ void SetEVecStandard2d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
181   const CeedInt target_comp   = n / (P_1D * P_1D);
182   const CeedInt target_node_x = n % P_1D;
183   const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
184 
185   if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
186     r_v[target_comp] = value;
187   }
188 }
189 
190 //------------------------------------------------------------------------------
191 // L-vector -> E-vector, offsets provided
192 //------------------------------------------------------------------------------
193 template <int NUM_COMP, int COMP_STRIDE, int P_1D>
194 inline __device__ void ReadLVecStandard2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
195                                           const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
196   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
197     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
198     const CeedInt ind  = indices[node + elem * P_1D * P_1D];
199 
200     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
201   }
202 }
203 
204 //------------------------------------------------------------------------------
205 // L-vector -> E-vector, strided
206 //------------------------------------------------------------------------------
207 template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
208 inline __device__ void ReadLVecStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
209   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
210     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
211     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
212 
213     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
214   }
215 }
216 
217 //------------------------------------------------------------------------------
218 // E-vector -> L-vector, offsets provided
219 //------------------------------------------------------------------------------
220 template <int NUM_COMP, int COMP_STRIDE, int P_1D>
221 inline __device__ void WriteLVecStandard2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
222                                            const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
223   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
224     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
225     const CeedInt ind  = indices[node + elem * P_1D * P_1D];
226 
227     for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
228   }
229 }
230 
231 template <int NUM_COMP, int COMP_STRIDE, int P_1D>
232 inline __device__ void WriteLVecStandard2d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
233                                                   const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
234                                                   CeedScalar *__restrict__ d_v) {
235   const CeedInt target_comp   = n / (P_1D * P_1D);
236   const CeedInt target_node_x = n % P_1D;
237   const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
238 
239   if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
240     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
241     const CeedInt ind  = indices[node + elem * P_1D * P_1D];
242 
243     atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]);
244   }
245 }
246 
247 //------------------------------------------------------------------------------
248 // E-vector -> L-vector, full assembly
249 //------------------------------------------------------------------------------
250 template <int NUM_COMP, int COMP_STRIDE, int P_1D>
251 inline __device__ void WriteLVecStandard2d_Assembly(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
252                                                     const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
253   const CeedInt elem_size  = P_1D * P_1D;
254   const CeedInt in_comp    = in / elem_size;
255   const CeedInt in_node_x  = in % P_1D;
256   const CeedInt in_node_y  = (in % elem_size) / P_1D;
257   const CeedInt e_vec_size = elem_size * NUM_COMP;
258 
259   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
260     const CeedInt in_node  = in_node_x + in_node_y * P_1D;
261     const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D;
262 
263     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
264       const CeedInt index = (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node;
265 
266       d_v[elem * e_vec_size * e_vec_size + index] += r_v[comp];
267     }
268   }
269 }
270 
271 //------------------------------------------------------------------------------
272 // E-vector -> L-vector, Qfunction assembly
273 //------------------------------------------------------------------------------
274 template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
275 inline __device__ void WriteLVecStandard2d_QFAssembly(SharedData_Hip &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
276                                                       const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
277   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
278     const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D) + elem * Q_1D * Q_1D;
279 
280     for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
281       d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * Q_1D * num_elem)] = r_v[comp];
282     }
283   }
284 }
285 
286 //------------------------------------------------------------------------------
287 // E-vector -> L-vector, strided
288 //------------------------------------------------------------------------------
289 template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
290 inline __device__ void WriteLVecStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
291                                           CeedScalar *__restrict__ d_v) {
292   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
293     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
294     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
295 
296     for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp];
297   }
298 }
299 
300 //------------------------------------------------------------------------------
301 // 3D
302 //------------------------------------------------------------------------------
303 
304 //------------------------------------------------------------------------------
305 // Set E-vector value
306 //------------------------------------------------------------------------------
307 template <int NUM_COMP, int P_1D>
308 inline __device__ void SetEVecStandard3d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
309   const CeedInt target_comp   = n / (P_1D * P_1D * P_1D);
310   const CeedInt target_node_x = n % P_1D;
311   const CeedInt target_node_y = ((n % (P_1D * P_1D * P_1D)) / P_1D) % P_1D;
312   const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D);
313 
314   if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
315     r_v[target_node_z + target_comp * P_1D] = value;
316   }
317 }
318 
319 //------------------------------------------------------------------------------
320 // L-vector -> E-vector, offsets provided
321 //------------------------------------------------------------------------------
322 template <int NUM_COMP, int COMP_STRIDE, int P_1D>
323 inline __device__ void ReadLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
324                                           const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
325   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
326     for (CeedInt z = 0; z < P_1D; z++) {
327       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
328       const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
329 
330       for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + COMP_STRIDE * comp];
331     }
332   }
333 }
334 
335 //------------------------------------------------------------------------------
336 // L-vector -> E-vector, strided
337 //------------------------------------------------------------------------------
338 template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
339 inline __device__ void ReadLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
340   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
341     for (CeedInt z = 0; z < P_1D; z++) {
342       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
343       const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
344 
345       for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + comp * STRIDES_COMP];
346     }
347   }
348 }
349 
350 //------------------------------------------------------------------------------
351 // E-vector -> Q-vector, offests provided
352 //------------------------------------------------------------------------------
353 template <int NUM_COMP, int COMP_STRIDE, int Q_1D>
354 inline __device__ void ReadEVecSliceStandard3d(SharedData_Hip &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
355                                                const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u,
356                                                CeedScalar *__restrict__ r_u) {
357   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
358     const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
359     const CeedInt ind  = indices[node + elem * Q_1D * Q_1D * Q_1D];
360 
361     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
362   }
363 }
364 
365 //------------------------------------------------------------------------------
366 // E-vector -> Q-vector, strided
367 //------------------------------------------------------------------------------
368 template <int NUM_COMP, int Q_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
369 inline __device__ void ReadEVecSliceStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
370                                               CeedScalar *__restrict__ r_u) {
371   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
372     const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
373     const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
374 
375     for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
376   }
377 }
378 
379 //------------------------------------------------------------------------------
380 // E-vector -> L-vector, offsets provided
381 //------------------------------------------------------------------------------
382 template <int NUM_COMP, int COMP_STRIDE, int P_1D>
383 inline __device__ void WriteLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
384                                            const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
385   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
386     for (CeedInt z = 0; z < P_1D; z++) {
387       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
388       const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
389 
390       for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1D]);
391     }
392   }
393 }
394 
395 template <int NUM_COMP, int COMP_STRIDE, int P_1D>
396 inline __device__ void WriteLVecStandard3d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
397                                                   const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
398                                                   CeedScalar *__restrict__ d_v) {
399   const CeedInt target_comp   = n / (P_1D * P_1D * P_1D);
400   const CeedInt target_node_x = n % P_1D;
401   const CeedInt target_node_y = ((n % (P_1D * P_1D * P_1D)) / P_1D) % P_1D;
402   const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D);
403 
404   if (data.t_id_x == target_node_x && data.t_id_y == target_node_y) {
405     const CeedInt node = data.t_id_x + data.t_id_y * P_1D + target_node_z * P_1D * P_1D;
406     const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];
407 
408     atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_node_z + target_comp * P_1D]);
409   }
410 }
411 
412 //------------------------------------------------------------------------------
413 // E-vector -> L-vector, full assembly
414 //------------------------------------------------------------------------------
415 template <int NUM_COMP, int COMP_STRIDE, int P_1D>
416 inline __device__ void WriteLVecStandard3d_Assembly(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
417                                                     const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
418   const CeedInt elem_size  = P_1D * P_1D * P_1D;
419   const CeedInt in_comp    = in / elem_size;
420   const CeedInt in_node_x  = in % P_1D;
421   const CeedInt in_node_y  = (in % (P_1D * P_1D)) / P_1D;
422   const CeedInt in_node_z  = (in % elem_size) / (P_1D * P_1D);
423   const CeedInt e_vec_size = elem_size * NUM_COMP;
424 
425   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
426     const CeedInt in_node = in_node_x + in_node_y * P_1D + in_node_z * P_1D * P_1D;
427     for (CeedInt z = 0; z < P_1D; z++) {
428       const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
429 
430       for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
431         const CeedInt index = (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node;
432 
433         d_v[elem * e_vec_size * e_vec_size + index] += r_v[z + comp * P_1D];
434       }
435     }
436   }
437 }
438 
439 //------------------------------------------------------------------------------
440 // E-vector -> L-vector, Qfunction assembly
441 //------------------------------------------------------------------------------
442 template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
443 inline __device__ void WriteLVecStandard3d_QFAssembly(SharedData_Hip &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
444                                                       const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
445   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
446     for (CeedInt z = 0; z < Q_1D; z++) {
447       const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D + z * Q_1D * Q_1D) + elem * Q_1D * Q_1D * Q_1D;
448 
449       for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
450         d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * Q_1D * Q_1D * num_elem)] = r_v[z + comp * Q_1D];
451       }
452     }
453   }
454 }
455 
456 //------------------------------------------------------------------------------
457 // E-vector -> L-vector, strided
458 //------------------------------------------------------------------------------
459 template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
460 inline __device__ void WriteLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
461                                           CeedScalar *__restrict__ d_v) {
462   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
463     for (CeedInt z = 0; z < P_1D; z++) {
464       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
465       const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
466 
467       for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1D];
468     }
469   }
470 }
471 
472 //------------------------------------------------------------------------------
473 // 3D collocated derivatives computation
474 //------------------------------------------------------------------------------
475 template <int NUM_COMP, int Q_1D, int T_1D>
476 inline __device__ void GradColloSlice3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
477                                         CeedScalar *__restrict__ r_V) {
478   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
479     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
480       __syncthreads();
481       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1D];
482       __syncthreads();
483       // X derivative
484       r_V[comp + 0 * NUM_COMP] = 0.0;
485       for (CeedInt i = 0; i < Q_1D; i++) {
486         r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1D] * data.slice[i + data.t_id_y * T_1D];
487       }
488       // Y derivative
489       r_V[comp + 1 * NUM_COMP] = 0.0;
490       for (CeedInt i = 0; i < Q_1D; i++) {
491         r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1D] * data.slice[data.t_id_x + i * T_1D];
492       }
493       // Z derivative
494       r_V[comp + 2 * NUM_COMP] = 0.0;
495       for (CeedInt i = 0; i < Q_1D; i++) {
496         r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1D] * r_U[i + comp * Q_1D];
497       }
498     }
499   }
500 }
501 
502 //------------------------------------------------------------------------------
503 // 3D collocated derivatives transpose
504 //------------------------------------------------------------------------------
505 template <int NUM_COMP, int Q_1D, int T_1D>
506 inline __device__ void GradColloSliceTranspose3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
507                                                  CeedScalar *__restrict__ r_V) {
508   if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
509     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
510       // X derivative
511       __syncthreads();
512       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
513       __syncthreads();
514       for (CeedInt i = 0; i < Q_1D; i++) {
515         r_V[q + comp * Q_1D] += c_G[data.t_id_x + i * Q_1D] * data.slice[i + data.t_id_y * T_1D];
516       }
517       // Y derivative
518       __syncthreads();
519       data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
520       __syncthreads();
521       for (CeedInt i = 0; i < Q_1D; i++) {
522         r_V[q + comp * Q_1D] += c_G[data.t_id_y + i * Q_1D] * data.slice[data.t_id_x + i * T_1D];
523       }
524       // Z derivative
525       for (CeedInt i = 0; i < Q_1D; i++) {
526         r_V[i + comp * Q_1D] += c_G[i + q * Q_1D] * r_U[comp + 2 * NUM_COMP];
527       }
528     }
529   }
530 }
531