xref: /libCEED/include/ceed/jit-source/cuda/cuda-gen-templates.h (revision 0816752e297ae5dd4074175fee440caf6c69c9f1)
1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for CUDA backend macro and type definitions for JiT source
10 #include <ceed/types.h>
11 
12 //------------------------------------------------------------------------------
13 // Load matrices for basis actions
14 //------------------------------------------------------------------------------
// Cooperatively copy a P x Q basis matrix from device memory d_B into the
// buffer B, with every thread of the block taking a grid-stride share.
template <int P, int Q>
inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
  const CeedInt num_threads = blockDim.x * blockDim.y * blockDim.z;

  for (CeedInt entry = data.t_id; entry < P * Q; entry += num_threads) {
    B[entry] = d_B[entry];
  }
}
19 
20 //------------------------------------------------------------------------------
21 // AtPoints
22 //------------------------------------------------------------------------------
23 
24 //------------------------------------------------------------------------------
25 // L-vector -> single point
26 //------------------------------------------------------------------------------
// Read the NUM_COMP components of point p of element elem from the L-vector d_u
// into the register array r_u. Component comp lives at d_u[ind + comp * COMP_STRIDE],
// where ind = indices[p + elem * NUM_PTS].
// NOTE(review): unlike WritePoint there is no p < points_in_elem guard here, so the
// indices entry for padding points is read unconditionally — presumably indices is
// padded to NUM_PTS entries per element; confirm against the caller.
template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
                                 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // L-vector location of this point's first component
  const CeedInt ind = indices[p + elem * NUM_PTS];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[ind + comp * COMP_STRIDE];
  }
}
36 
37 //------------------------------------------------------------------------------
38 // Single point -> L-vector
39 //------------------------------------------------------------------------------
// Accumulate the NUM_COMP register values r_u for point p of element elem back
// into the L-vector d_u; padding points (p >= points_in_elem) carry no data.
template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) {
  if (p >= points_in_elem) return;

  // L-vector location of this point's first component
  const CeedInt ind = indices[p + elem * NUM_PTS];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_u[ind + comp * COMP_STRIDE] += r_u[comp];
}
51 
52 //------------------------------------------------------------------------------
53 // 1D
54 //------------------------------------------------------------------------------
55 
56 //------------------------------------------------------------------------------
57 // Set E-vector value
58 //------------------------------------------------------------------------------
// Set the single E-vector entry with flat index n (= comp * P_1D + node) to
// value; only the thread owning that node touches its register array r_v.
template <int NUM_COMP, int P_1D>
inline __device__ void SetEVecStandard1d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
  const CeedInt comp = n / P_1D;
  const CeedInt node = n % P_1D;

  if (data.t_id_x != node) return;
  r_v[comp] = value;
}
68 
69 //------------------------------------------------------------------------------
70 // L-vector -> E-vector, offsets provided
71 //------------------------------------------------------------------------------
// Gather one element's nodal values from the L-vector d_u into registers r_u,
// one node per thread along x, using the offsets in indices.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void ReadLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  if (data.t_id_x >= P_1D) return;  // threads beyond P_1D own no node

  const CeedInt ind = indices[data.t_id_x + elem * P_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[ind + COMP_STRIDE * comp];
  }
}
82 
83 //------------------------------------------------------------------------------
84 // L-vector -> E-vector, strided
85 //------------------------------------------------------------------------------
// Gather one element's nodal values from the strided L-vector d_u into
// registers r_u; the location is fully determined by compile-time strides.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  if (data.t_id_x >= P_1D) return;  // one node per thread along x

  const CeedInt base = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[base + comp * STRIDES_COMP];
  }
}
96 
97 //------------------------------------------------------------------------------
98 // E-vector -> L-vector, offsets provided
99 //------------------------------------------------------------------------------
// Scatter one element's register values r_v back into the L-vector d_v using
// the offsets in indices; atomicAdd because nodes are shared between elements.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  if (data.t_id_x >= P_1D) return;  // one node per thread along x

  const CeedInt ind = indices[data.t_id_x + elem * P_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
  }
}
110 
// Scatter the single E-vector entry with flat index n (= comp * P_1D + node)
// into the L-vector d_v; only the thread owning that node performs the atomic add.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard1d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
                                                  CeedScalar *__restrict__ d_v) {
  const CeedInt comp = n / P_1D;
  const CeedInt node = n % P_1D;

  if (data.t_id_x != node) return;

  const CeedInt ind = indices[node + elem * P_1D];

  atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
}
124 
125 //------------------------------------------------------------------------------
126 // E-vector -> L-vector, full assembly
127 //------------------------------------------------------------------------------
// Accumulate one column of the element matrix into the fully-assembled output d_v.
// `in` is the flat E-vector index (in_comp * P_1D + in_node) of the current unit input;
// each thread owns one output node and adds its NUM_COMP entries of that column.
// Layout implied by the index math: per element, a dense e_vec_size x e_vec_size block
// addressed as [in_comp][out_comp][out_node][in_node].
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard1d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  const CeedInt in_comp    = in / P_1D;         // component of the unit input
  const CeedInt in_node    = in % P_1D;         // node of the unit input
  const CeedInt e_vec_size = P_1D * NUM_COMP;   // rows (= columns) of the element matrix

  if (data.t_id_x < P_1D) {
    const CeedInt out_node = data.t_id_x;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      d_v[elem * e_vec_size * e_vec_size + (in_comp * NUM_COMP + comp) * P_1D * P_1D + out_node * P_1D + in_node] += r_v[comp];
    }
  }
}
143 
144 //------------------------------------------------------------------------------
145 // E-vector -> L-vector, Qfunction assembly
146 //------------------------------------------------------------------------------
// Write NUM_COMP_FIELD quadrature values of one output field into the assembled
// QFunction representation d_v. From the index math below, value (elem, q, comp)
// lands at d_v[q + elem * Q_1D + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * num_elem)],
// i.e. one contiguous (Q_1D * num_elem) panel per (input, output component) pair.
template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
inline __device__ void WriteLVecStandard1d_QFAssembly(SharedData_Cuda &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
                                                      const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  if (data.t_id_x < Q_1D) {
    // Each active thread owns one quadrature point of this element
    const CeedInt ind = data.t_id_x + elem * Q_1D;

    for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
      d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * num_elem)] = r_v[comp];
    }
  }
}
158 
159 //------------------------------------------------------------------------------
160 // E-vector -> L-vector, strided
161 //------------------------------------------------------------------------------
// Accumulate one element's register values r_v into the strided L-vector d_v;
// strided storage means each entry is owned by exactly one thread, so a plain
// += (no atomics) suffices.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  if (data.t_id_x >= P_1D) return;  // one node per thread along x

  const CeedInt base = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    d_v[base + comp * STRIDES_COMP] += r_v[comp];
  }
}
172 
173 //------------------------------------------------------------------------------
174 // 2D
175 //------------------------------------------------------------------------------
176 
177 //------------------------------------------------------------------------------
178 // Set E-vector value
179 //------------------------------------------------------------------------------
// Set the single E-vector entry with flat index n (component-major over a
// P_1D x P_1D node grid) to value; only the owning thread writes its register.
template <int NUM_COMP, int P_1D>
inline __device__ void SetEVecStandard2d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
  // Decode n into (component, x node, y node)
  const CeedInt comp   = n / (P_1D * P_1D);
  const CeedInt node_x = n % P_1D;
  const CeedInt node_y = (n % (P_1D * P_1D)) / P_1D;

  if (data.t_id_x != node_x || data.t_id_y != node_y) return;
  r_v[comp] = value;
}
190 
191 //------------------------------------------------------------------------------
192 // L-vector -> E-vector, offsets provided
193 //------------------------------------------------------------------------------
// Gather one element's nodal values from the L-vector d_u into registers r_u,
// one node per (x, y) thread, using the offsets in indices.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void ReadLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
  const CeedInt ind  = indices[node + elem * P_1D * P_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[ind + COMP_STRIDE * comp];
  }
}
204 
205 //------------------------------------------------------------------------------
206 // L-vector -> E-vector, strided
207 //------------------------------------------------------------------------------
// Gather one element's nodal values from the strided L-vector d_u into
// registers r_u; the location is fully determined by compile-time strides.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
  const CeedInt base = node * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[base + comp * STRIDES_COMP];
  }
}
218 
219 //------------------------------------------------------------------------------
220 // E-vector -> L-vector, offsets provided
221 //------------------------------------------------------------------------------
// Scatter one element's register values r_v into the L-vector d_v using the
// offsets in indices; atomicAdd because nodes are shared between elements.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
  const CeedInt ind  = indices[node + elem * P_1D * P_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
  }
}
232 
// Scatter the single E-vector entry with flat index n (component-major over a
// P_1D x P_1D node grid) into the L-vector d_v; only the owning thread adds.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard2d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
                                                  CeedScalar *__restrict__ d_v) {
  // Decode n into (component, x node, y node)
  const CeedInt comp   = n / (P_1D * P_1D);
  const CeedInt node_x = n % P_1D;
  const CeedInt node_y = (n % (P_1D * P_1D)) / P_1D;

  if (data.t_id_x != node_x || data.t_id_y != node_y) return;

  const CeedInt node = node_x + node_y * P_1D;
  const CeedInt ind  = indices[node + elem * P_1D * P_1D];

  atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
}
248 
249 //------------------------------------------------------------------------------
250 // E-vector -> L-vector, full assembly
251 //------------------------------------------------------------------------------
// Accumulate one column of the element matrix into the fully-assembled output d_v.
// `in` is the flat E-vector index of the current unit input, decoded below into
// (in_comp, in_node_x, in_node_y); each (x, y) thread owns one output node and adds
// its NUM_COMP entries of that column.
// Layout implied by the index math: per element, a dense e_vec_size x e_vec_size block
// addressed as [in_comp][out_comp][out_node][in_node].
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard2d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  const CeedInt elem_size  = P_1D * P_1D;               // nodes per element
  const CeedInt in_comp    = in / elem_size;            // component of the unit input
  const CeedInt in_node_x  = in % P_1D;                 // x node of the unit input
  const CeedInt in_node_y  = (in % elem_size) / P_1D;   // y node of the unit input
  const CeedInt e_vec_size = elem_size * NUM_COMP;      // rows (= columns) of the element matrix

  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    const CeedInt in_node  = in_node_x + in_node_y * P_1D;
    const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      const CeedInt index = (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node;

      d_v[elem * e_vec_size * e_vec_size + index] += r_v[comp];
    }
  }
}
272 
273 //------------------------------------------------------------------------------
274 // E-vector -> L-vector, Qfunction assembly
275 //------------------------------------------------------------------------------
// Write NUM_COMP_FIELD quadrature values of one output field into the assembled
// QFunction representation d_v. From the index math below, value (elem, q, comp)
// lands in a contiguous (Q_1D * Q_1D * num_elem) panel selected by
// (input_offset * NUM_COMP_OUT + output_offset + comp).
template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
inline __device__ void WriteLVecStandard2d_QFAssembly(SharedData_Cuda &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
                                                      const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
    // Each active thread owns one quadrature point of this element
    const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D) + elem * Q_1D * Q_1D;

    for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
      d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * Q_1D * num_elem)] = r_v[comp];
    }
  }
}
287 
288 //------------------------------------------------------------------------------
289 // E-vector -> L-vector, strided
290 //------------------------------------------------------------------------------
// Accumulate one element's register values r_v into the strided L-vector d_v;
// strided storage means each entry is owned by exactly one thread, so a plain
// += (no atomics) suffices.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
  const CeedInt base = node * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    d_v[base + comp * STRIDES_COMP] += r_v[comp];
  }
}
301 
302 //------------------------------------------------------------------------------
303 // 3D
304 //------------------------------------------------------------------------------
305 
306 //------------------------------------------------------------------------------
307 // Set E-vector value
308 //------------------------------------------------------------------------------
// Set the single E-vector entry with flat index n (component-major over a
// P_1D^3 node grid) to value. Registers hold a full z-line per component, so
// the owning (x, y) thread writes slot z + comp * P_1D.
template <int NUM_COMP, int P_1D>
inline __device__ void SetEVecStandard3d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
  // Decode n into (component, x node, y node, z node)
  const CeedInt comp   = n / (P_1D * P_1D * P_1D);
  const CeedInt node_x = n % P_1D;
  const CeedInt node_y = (n % (P_1D * P_1D)) / P_1D;
  const CeedInt node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D);

  if (data.t_id_x != node_x || data.t_id_y != node_y) return;
  r_v[node_z + comp * P_1D] = value;
}
320 
321 //------------------------------------------------------------------------------
322 // L-vector -> E-vector, offsets provided
323 //------------------------------------------------------------------------------
// Gather one element's nodal values from the L-vector d_u into registers r_u.
// Each (x, y) thread loops over its z-line of nodes, keeping a full z-line per
// component in registers (slot z + comp * P_1D).
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  for (CeedInt z = 0; z < P_1D; z++) {
    const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
    const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      r_u[z + comp * P_1D] = d_u[ind + COMP_STRIDE * comp];
    }
  }
}
336 
337 //------------------------------------------------------------------------------
338 // L-vector -> E-vector, strided
339 //------------------------------------------------------------------------------
// Gather one element's nodal values from the strided L-vector d_u into
// registers r_u, one z-line of nodes per (x, y) thread.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  for (CeedInt z = 0; z < P_1D; z++) {
    const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
    const CeedInt base = node * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      r_u[z + comp * P_1D] = d_u[base + comp * STRIDES_COMP];
    }
  }
}
352 
353 //------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
355 //------------------------------------------------------------------------------
// Gather the z = q slice of one element's E-vector values into registers r_u,
// one quadrature point per (x, y) thread, using the offsets in indices.
template <int NUM_COMP, int COMP_STRIDE, int Q_1D>
inline __device__ void ReadEVecSliceStandard3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
                                               const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u,
                                               CeedScalar *__restrict__ r_u) {
  if (data.t_id_x >= Q_1D || data.t_id_y >= Q_1D) return;

  const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
  const CeedInt ind  = indices[node + elem * Q_1D * Q_1D * Q_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[ind + COMP_STRIDE * comp];
  }
}
367 
368 //------------------------------------------------------------------------------
369 // E-vector -> Q-vector, strided
370 //------------------------------------------------------------------------------
// Gather the z = q slice of one element's strided E-vector values into
// registers r_u, one quadrature point per (x, y) thread.
template <int NUM_COMP, int Q_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
                                              CeedScalar *__restrict__ r_u) {
  if (data.t_id_x >= Q_1D || data.t_id_y >= Q_1D) return;

  const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
  const CeedInt base = node * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[base + comp * STRIDES_COMP];
  }
}
381 
382 //------------------------------------------------------------------------------
383 // E-vector -> L-vector, offsets provided
384 //------------------------------------------------------------------------------
// Scatter one element's register values r_v into the L-vector d_v using the
// offsets in indices; each (x, y) thread walks its z-line, and atomicAdd is
// used because nodes are shared between elements.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  for (CeedInt z = 0; z < P_1D; z++) {
    const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
    const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1D]);
    }
  }
}
397 
// Scatter the single E-vector entry with flat index n (component-major over a
// P_1D^3 node grid) into the L-vector d_v; only the owning (x, y) thread adds
// its register slot z + comp * P_1D.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard3d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
                                                  CeedScalar *__restrict__ d_v) {
  // Decode n into (component, x node, y node, z node)
  const CeedInt comp   = n / (P_1D * P_1D * P_1D);
  const CeedInt node_x = n % P_1D;
  const CeedInt node_y = (n % (P_1D * P_1D)) / P_1D;
  const CeedInt node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D);

  if (data.t_id_x != node_x || data.t_id_y != node_y) return;

  const CeedInt node = node_x + node_y * P_1D + node_z * P_1D * P_1D;
  const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];

  atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[node_z + comp * P_1D]);
}
414 
415 //------------------------------------------------------------------------------
416 // E-vector -> L-vector, full assembly
417 //------------------------------------------------------------------------------
// Accumulate one column of the element matrix into the fully-assembled output d_v.
// `in` is the flat E-vector index of the current unit input, decoded below into
// (in_comp, in_node_x/y/z); each (x, y) thread walks its z-line of output nodes and
// adds its NUM_COMP entries of that column.
// Layout implied by the index math: per element, a dense e_vec_size x e_vec_size block
// addressed as [in_comp][out_comp][out_node][in_node].
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard3d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  const CeedInt elem_size  = P_1D * P_1D * P_1D;                  // nodes per element
  const CeedInt in_comp    = in / elem_size;                      // component of the unit input
  const CeedInt in_node_x  = in % P_1D;                           // x node of the unit input
  const CeedInt in_node_y  = (in % (P_1D * P_1D)) / P_1D;         // y node of the unit input
  const CeedInt in_node_z  = (in % elem_size) / (P_1D * P_1D);    // z node of the unit input
  const CeedInt e_vec_size = elem_size * NUM_COMP;                // rows (= columns) of the element matrix

  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    const CeedInt in_node = in_node_x + in_node_y * P_1D + in_node_z * P_1D * P_1D;
    for (CeedInt z = 0; z < P_1D; z++) {
      const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;

      for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
        const CeedInt index = (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node;

        // Registers hold a full z-line per component: slot z + comp * P_1D
        d_v[elem * e_vec_size * e_vec_size + index] += r_v[z + comp * P_1D];
      }
    }
  }
}
441 
442 //------------------------------------------------------------------------------
443 // E-vector -> L-vector, Qfunction assembly
444 //------------------------------------------------------------------------------
// Write NUM_COMP_FIELD quadrature values of one output field into the assembled
// QFunction representation d_v. Each (x, y) thread walks its z-line of quadrature
// points; from the index math below, value (elem, q, comp) lands in a contiguous
// (Q_1D^3 * num_elem) panel selected by (input_offset * NUM_COMP_OUT + output_offset + comp).
template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
inline __device__ void WriteLVecStandard3d_QFAssembly(SharedData_Cuda &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
                                                      const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
    for (CeedInt z = 0; z < Q_1D; z++) {
      const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D + z * Q_1D * Q_1D) + elem * Q_1D * Q_1D * Q_1D;

      for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
        // Registers hold a full z-line per component: slot z + comp * Q_1D
        d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * Q_1D * Q_1D * num_elem)] = r_v[z + comp * Q_1D];
      }
    }
  }
}
458 
459 //------------------------------------------------------------------------------
460 // E-vector -> L-vector, strided
461 //------------------------------------------------------------------------------
// Accumulate one element's register values r_v into the strided L-vector d_v;
// each (x, y) thread walks its z-line, and strided storage means each entry is
// owned by exactly one thread, so a plain += (no atomics) suffices.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  for (CeedInt z = 0; z < P_1D; z++) {
    const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
    const CeedInt base = node * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      d_v[base + comp * STRIDES_COMP] += r_v[z + comp * P_1D];
    }
  }
}
474 
475 //------------------------------------------------------------------------------
476 // 3D collocated derivatives computation
477 //------------------------------------------------------------------------------
// Compute collocated X/Y/Z derivatives of r_U at z-slice q, writing them to
// r_V[comp + dim * NUM_COMP]. The X/Y contractions exchange the slice through
// data.slice (shared, T_1D-row pitch); the Z contraction is register-only.
//
// Fix: the original placed __syncthreads() inside the divergent
// `t_id < Q_1D` guard. CUDA requires __syncthreads() to be reached by all
// threads of the block, and the slice buffer is T_1D-pitched (T_1D may exceed
// Q_1D), so out-of-tile threads could skip the barrier — undefined behavior.
// The barriers are hoisted so every thread reaches them; only the data
// accesses remain guarded. Behavior for in-tile threads is unchanged.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                        CeedScalar *__restrict__ r_V) {
  const bool in_bounds = data.t_id_x < Q_1D && data.t_id_y < Q_1D;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    __syncthreads();  // previous iteration's readers are done with the slice
    if (in_bounds) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1D];
    __syncthreads();  // slice fully populated before any thread reads it
    if (in_bounds) {
      // X derivative: contract the row of the slice against the 1D gradient matrix
      r_V[comp + 0 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1D] * data.slice[i + data.t_id_y * T_1D];
      }
      // Y derivative: contract the column of the slice
      r_V[comp + 1 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1D] * data.slice[data.t_id_x + i * T_1D];
      }
      // Z derivative: z-line data already lives in registers, no slice needed
      r_V[comp + 2 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1D] * r_U[i + comp * Q_1D];
      }
    }
  }
}
504 
505 //------------------------------------------------------------------------------
506 // 3D collocated derivatives transpose
507 //------------------------------------------------------------------------------
// Transpose of GradColloSlice3d: accumulate the contributions of the X/Y/Z
// derivative components r_U[comp + dim * NUM_COMP] back into the z-slice q of
// r_V. The X/Y contractions exchange each component through data.slice
// (shared, T_1D-row pitch); the Z contraction is register-only.
//
// Fix: the original placed __syncthreads() inside the divergent
// `t_id < Q_1D` guard. CUDA requires __syncthreads() to be reached by all
// threads of the block, and the slice buffer is T_1D-pitched (T_1D may exceed
// Q_1D), so out-of-tile threads could skip the barrier — undefined behavior.
// The barriers are hoisted so every thread reaches them; only the data
// accesses remain guarded. Behavior for in-tile threads is unchanged.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void GradColloSliceTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                                 CeedScalar *__restrict__ r_V) {
  const bool in_bounds = data.t_id_x < Q_1D && data.t_id_y < Q_1D;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // X derivative contribution (transposed contraction: c_G indexed by column)
    __syncthreads();  // previous readers are done with the slice
    if (in_bounds) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
    __syncthreads();  // slice fully populated before any thread reads it
    if (in_bounds) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[q + comp * Q_1D] += c_G[data.t_id_x + i * Q_1D] * data.slice[i + data.t_id_y * T_1D];
      }
    }
    // Y derivative contribution
    __syncthreads();
    if (in_bounds) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
    __syncthreads();
    if (in_bounds) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[q + comp * Q_1D] += c_G[data.t_id_y + i * Q_1D] * data.slice[data.t_id_x + i * T_1D];
      }
      // Z derivative contribution: spread across the register z-line, no slice needed
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[i + comp * Q_1D] += c_G[i + q * Q_1D] * r_U[comp + 2 * NUM_COMP];
      }
    }
  }
}
534