xref: /libCEED/include/ceed/jit-source/cuda/cuda-gen-templates.h (revision d3d5610df460248361ab17f2fa259b4661019597)
1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for CUDA backend macro and type definitions for JiT source
10 #include <ceed/types.h>
11 
12 //------------------------------------------------------------------------------
13 // Load matrices for basis actions
14 //------------------------------------------------------------------------------
// Cooperatively copy the P x Q basis matrix d_B into the buffer B.
// Entries are distributed round-robin over the block's flat thread id
// (data.t_id), so consecutive threads read consecutive global entries.
template <int P, int Q>
inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
  const CeedInt num_threads = blockDim.x * blockDim.y * blockDim.z;

  for (CeedInt i = data.t_id; i < P * Q; i += num_threads) {
    B[i] = d_B[i];
  }
}
19 
20 //------------------------------------------------------------------------------
21 // AtPoints
22 //------------------------------------------------------------------------------
23 
24 //------------------------------------------------------------------------------
25 // L-vector -> single point
26 //------------------------------------------------------------------------------
// Gather the NUM_COMP values of a single point `p` of element `elem` from the
// L-vector d_u into per-thread registers r_u.
// The L-vector offset comes from the element's index list; components are
// COMP_STRIDE apart in d_u.
//
// NOTE(review): unlike WritePoint below, there is no `p < points_in_elem`
// guard here — presumably the indices array is padded so reads for padding
// points land on a valid (if redundant) location. TODO confirm against the
// restriction setup.
template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
                                 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  const CeedInt ind = indices[p + elem * NUM_PTS];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[ind + comp * COMP_STRIDE];
  }
}
36 
37 //------------------------------------------------------------------------------
38 // Single point -> L-vector
39 //------------------------------------------------------------------------------
// Accumulate the NUM_COMP register values for point `p` of element `elem`
// back into the L-vector d_u at the offset given by the element's index list.
// Padding points (p >= points_in_elem) are skipped so they contribute nothing.
template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) {
  if (p >= points_in_elem) return;

  const CeedInt l_vec_ind = indices[p + elem * NUM_PTS];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    d_u[l_vec_ind + comp * COMP_STRIDE] += r_u[comp];
  }
}
51 
52 //------------------------------------------------------------------------------
53 // 1D
54 //------------------------------------------------------------------------------
55 
56 //------------------------------------------------------------------------------
57 // Set E-vector value
58 //------------------------------------------------------------------------------
// Set a single E-vector entry, identified by the flat index n = comp * P_1D + node,
// in the register representation r_v (one value per component per thread).
template <int NUM_COMP, int P_1D>
inline __device__ void SetEVecStandard1d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
  const CeedInt comp = n / P_1D;
  const CeedInt node = n % P_1D;

  // Only the thread owning that node updates its register
  if (data.t_id_x != node) return;

  r_v[comp] = value;
}
68 
69 //------------------------------------------------------------------------------
70 // L-vector -> E-vector, offsets provided
71 //------------------------------------------------------------------------------
// Gather one element's values from the L-vector d_u into registers r_u,
// using the element's index list. One thread per node along x; threads with
// t_id_x >= P_1D are idle.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void ReadLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  if (data.t_id_x >= P_1D) return;

  const CeedInt l_vec_ind = indices[data.t_id_x + elem * P_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[l_vec_ind + COMP_STRIDE * comp];
  }
}
82 
83 //------------------------------------------------------------------------------
84 // L-vector -> E-vector, strided
85 //------------------------------------------------------------------------------
// Gather one element's values from a strided L-vector into registers r_u.
// The slot for (elem, node, comp) is computed from the compile-time strides.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  if (data.t_id_x >= P_1D) return;

  const CeedInt offset = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[offset + comp * STRIDES_COMP];
  }
}
96 
97 //------------------------------------------------------------------------------
98 // E-vector -> L-vector, offsets provided
99 //------------------------------------------------------------------------------
// Scatter-add one element's register values r_v into the L-vector d_v.
// atomicAdd is used because L-vector nodes may be written by multiple
// elements processed concurrently.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  if (data.t_id_x >= P_1D) return;

  const CeedInt l_vec_ind = indices[data.t_id_x + elem * P_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    atomicAdd(&d_v[l_vec_ind + COMP_STRIDE * comp], r_v[comp]);
  }
}
110 
// Scatter-add a single E-vector entry, identified by the flat index
// n = comp * P_1D + node, into the L-vector d_v.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard1d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
                                                  CeedScalar *__restrict__ d_v) {
  const CeedInt comp = n / P_1D;
  const CeedInt node = n % P_1D;

  // Only the thread holding that node writes
  if (data.t_id_x != node) return;

  atomicAdd(&d_v[indices[node + elem * P_1D] + COMP_STRIDE * comp], r_v[comp]);
}
124 
125 //------------------------------------------------------------------------------
126 // E-vector -> L-vector, full assembly
127 //------------------------------------------------------------------------------
// Accumulate one column of the per-element assembled matrix: input basis
// function `in` against the output node owned by this thread, for every
// output component. Each element owns an e_vec_size x e_vec_size block of d_v.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard1d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  if (data.t_id_x >= P_1D) return;

  const CeedInt in_comp    = in / P_1D;        // input component
  const CeedInt in_node    = in % P_1D;        // input node
  const CeedInt e_vec_size = P_1D * NUM_COMP;  // E-vector entries per element
  const CeedInt out_node   = data.t_id_x;
  CeedScalar   *elem_mat   = &d_v[elem * e_vec_size * e_vec_size];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    elem_mat[(in_comp * NUM_COMP + comp) * P_1D * P_1D + out_node * P_1D + in_node] += r_v[comp];
  }
}
143 
144 //------------------------------------------------------------------------------
145 // E-vector -> L-vector, strided
146 //------------------------------------------------------------------------------
// Accumulate one element's register values r_v into a strided L-vector.
// Plain += (no atomics) matches the strided layout used here.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  if (data.t_id_x >= P_1D) return;

  const CeedInt offset = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    d_v[offset + comp * STRIDES_COMP] += r_v[comp];
  }
}
157 
158 //------------------------------------------------------------------------------
159 // 2D
160 //------------------------------------------------------------------------------
161 
162 //------------------------------------------------------------------------------
163 // Set E-vector value
164 //------------------------------------------------------------------------------
// Set a single E-vector entry, identified by the flat index
// n = comp * P_1D^2 + node_y * P_1D + node_x, in the register values r_v.
template <int NUM_COMP, int P_1D>
inline __device__ void SetEVecStandard2d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
  const CeedInt nodes_per_comp = P_1D * P_1D;
  const CeedInt comp           = n / nodes_per_comp;
  const CeedInt node           = n % nodes_per_comp;
  const CeedInt node_x         = node % P_1D;
  const CeedInt node_y         = node / P_1D;

  // Only the thread at that (x, y) position updates its register
  if (data.t_id_x != node_x || data.t_id_y != node_y) return;

  r_v[comp] = value;
}
175 
176 //------------------------------------------------------------------------------
177 // L-vector -> E-vector, offsets provided
178 //------------------------------------------------------------------------------
// Gather one element's values from the L-vector d_u into registers r_u via
// the element's index list. One thread per (x, y) node; extra threads idle.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void ReadLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt local_node = data.t_id_x + data.t_id_y * P_1D;
  const CeedInt l_vec_ind  = indices[local_node + elem * P_1D * P_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[l_vec_ind + COMP_STRIDE * comp];
  }
}
189 
190 //------------------------------------------------------------------------------
191 // L-vector -> E-vector, strided
192 //------------------------------------------------------------------------------
// Gather one element's values from a strided L-vector into registers r_u.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt local_node = data.t_id_x + data.t_id_y * P_1D;
  const CeedInt offset     = local_node * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[offset + comp * STRIDES_COMP];
  }
}
203 
204 //------------------------------------------------------------------------------
205 // E-vector -> L-vector, offsets provided
206 //------------------------------------------------------------------------------
// Scatter-add one element's register values r_v into the L-vector d_v.
// atomicAdd is used because nodes may be shared with other elements.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt local_node = data.t_id_x + data.t_id_y * P_1D;
  const CeedInt l_vec_ind  = indices[local_node + elem * P_1D * P_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    atomicAdd(&d_v[l_vec_ind + COMP_STRIDE * comp], r_v[comp]);
  }
}
217 
// Scatter-add a single E-vector entry, identified by the flat index
// n = comp * P_1D^2 + node_y * P_1D + node_x, into the L-vector d_v.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard2d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
                                                  CeedScalar *__restrict__ d_v) {
  const CeedInt nodes_per_comp = P_1D * P_1D;
  const CeedInt comp           = n / nodes_per_comp;
  const CeedInt node           = n % nodes_per_comp;
  const CeedInt node_x         = node % P_1D;
  const CeedInt node_y         = node / P_1D;

  // Only the thread at that (x, y) position writes
  if (data.t_id_x != node_x || data.t_id_y != node_y) return;

  atomicAdd(&d_v[indices[node + elem * nodes_per_comp] + COMP_STRIDE * comp], r_v[comp]);
}
233 
234 //------------------------------------------------------------------------------
235 // E-vector -> L-vector, full assembly
236 //------------------------------------------------------------------------------
// Accumulate one column of the per-element assembled matrix: input basis
// function `in` against the output node owned by this thread, for every
// output component. Each element owns an e_vec_size x e_vec_size block of d_v.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard2d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt elem_size  = P_1D * P_1D;           // nodes per element
  const CeedInt e_vec_size = elem_size * NUM_COMP;  // E-vector entries per element
  const CeedInt in_comp    = in / elem_size;        // input component
  const CeedInt in_node    = in % elem_size;        // input node (flat x + y * P_1D)
  const CeedInt out_node   = data.t_id_x + data.t_id_y * P_1D;
  CeedScalar   *elem_mat   = &d_v[elem * e_vec_size * e_vec_size];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    elem_mat[(in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node] += r_v[comp];
  }
}
257 
258 //------------------------------------------------------------------------------
259 // E-vector -> L-vector, strided
260 //------------------------------------------------------------------------------
// Accumulate one element's register values r_v into a strided L-vector.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt local_node = data.t_id_x + data.t_id_y * P_1D;
  const CeedInt offset     = local_node * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    d_v[offset + comp * STRIDES_COMP] += r_v[comp];
  }
}
271 
272 //------------------------------------------------------------------------------
273 // 3D
274 //------------------------------------------------------------------------------
275 
276 //------------------------------------------------------------------------------
277 // Set E-vector value
278 //------------------------------------------------------------------------------
// Set a single E-vector entry, identified by the flat index
// n = comp * P_1D^3 + node_z * P_1D^2 + node_y * P_1D + node_x.
// Each thread owns a full z-column, so ownership is matched on (x, y) only
// and the z level selects the register slot r_v[node_z + comp * P_1D].
template <int NUM_COMP, int P_1D>
inline __device__ void SetEVecStandard3d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
  const CeedInt nodes_per_comp = P_1D * P_1D * P_1D;
  const CeedInt comp           = n / nodes_per_comp;
  const CeedInt node           = n % nodes_per_comp;
  const CeedInt node_x         = node % P_1D;
  const CeedInt node_y         = (node / P_1D) % P_1D;
  const CeedInt node_z         = node / (P_1D * P_1D);

  if (data.t_id_x != node_x || data.t_id_y != node_y) return;

  r_v[node_z + comp * P_1D] = value;
}
290 
291 //------------------------------------------------------------------------------
292 // L-vector -> E-vector, offsets provided
293 //------------------------------------------------------------------------------
// Gather one element's values from the L-vector d_u into registers r_u.
// Each thread owns one (x, y) column and loads all z levels into
// r_u[z + comp * P_1D].
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt base_node = data.t_id_x + data.t_id_y * P_1D;

  for (CeedInt z = 0; z < P_1D; z++) {
    const CeedInt l_vec_ind = indices[base_node + z * P_1D * P_1D + elem * P_1D * P_1D * P_1D];

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      r_u[z + comp * P_1D] = d_u[l_vec_ind + COMP_STRIDE * comp];
    }
  }
}
306 
307 //------------------------------------------------------------------------------
308 // L-vector -> E-vector, strided
309 //------------------------------------------------------------------------------
// Gather one element's values from a strided L-vector into registers r_u,
// one (x, y) column per thread, all z levels.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt base_node = data.t_id_x + data.t_id_y * P_1D;

  for (CeedInt z = 0; z < P_1D; z++) {
    const CeedInt offset = (base_node + z * P_1D * P_1D) * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      r_u[z + comp * P_1D] = d_u[offset + comp * STRIDES_COMP];
    }
  }
}
322 
323 //------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
325 //------------------------------------------------------------------------------
// Gather one z = q slice of quadrature-point values for this thread's (x, y)
// position from d_u into registers r_u, using the element's index list.
template <int NUM_COMP, int COMP_STRIDE, int Q_1D>
inline __device__ void ReadEVecSliceStandard3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
                                               const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u,
                                               CeedScalar *__restrict__ r_u) {
  if (data.t_id_x >= Q_1D || data.t_id_y >= Q_1D) return;

  const CeedInt local_node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
  const CeedInt l_vec_ind  = indices[local_node + elem * Q_1D * Q_1D * Q_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[l_vec_ind + COMP_STRIDE * comp];
  }
}
337 
338 //------------------------------------------------------------------------------
339 // E-vector -> Q-vector, strided
340 //------------------------------------------------------------------------------
// Gather one z = q slice of quadrature-point values for this thread's (x, y)
// position from a strided E-vector into registers r_u.
template <int NUM_COMP, int Q_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
                                              CeedScalar *__restrict__ r_u) {
  if (data.t_id_x >= Q_1D || data.t_id_y >= Q_1D) return;

  const CeedInt local_node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
  const CeedInt offset     = local_node * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[offset + comp * STRIDES_COMP];
  }
}
351 
352 //------------------------------------------------------------------------------
353 // E-vector -> L-vector, offsets provided
354 //------------------------------------------------------------------------------
// Scatter-add one element's register values r_v (all z levels per thread)
// into the L-vector d_v; atomicAdd is used because nodes may be shared with
// other elements.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt base_node = data.t_id_x + data.t_id_y * P_1D;

  for (CeedInt z = 0; z < P_1D; z++) {
    const CeedInt l_vec_ind = indices[base_node + z * P_1D * P_1D + elem * P_1D * P_1D * P_1D];

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      atomicAdd(&d_v[l_vec_ind + COMP_STRIDE * comp], r_v[z + comp * P_1D]);
    }
  }
}
367 
// Scatter-add a single E-vector entry, identified by the flat index
// n = comp * P_1D^3 + node_z * P_1D^2 + node_y * P_1D + node_x, into d_v.
// Each thread owns a full z-column, so ownership is matched on (x, y) only
// and the z level selects the register slot r_v[node_z + comp * P_1D].
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard3d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
                                                  CeedScalar *__restrict__ d_v) {
  const CeedInt nodes_per_comp = P_1D * P_1D * P_1D;
  const CeedInt comp           = n / nodes_per_comp;
  const CeedInt node           = n % nodes_per_comp;
  const CeedInt node_x         = node % P_1D;
  const CeedInt node_y         = (node / P_1D) % P_1D;
  const CeedInt node_z         = node / (P_1D * P_1D);

  if (data.t_id_x != node_x || data.t_id_y != node_y) return;

  atomicAdd(&d_v[indices[node + elem * nodes_per_comp] + COMP_STRIDE * comp], r_v[node_z + comp * P_1D]);
}
384 
385 //------------------------------------------------------------------------------
386 // E-vector -> L-vector, full assembly
387 //------------------------------------------------------------------------------
// Accumulate one column of the per-element assembled matrix: input basis
// function `in` against the z-column of output nodes owned by this thread,
// for every output component. Each element owns an e_vec_size x e_vec_size
// block of d_v.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard3d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt elem_size  = P_1D * P_1D * P_1D;    // nodes per element
  const CeedInt e_vec_size = elem_size * NUM_COMP;  // E-vector entries per element
  const CeedInt in_comp    = in / elem_size;        // input component
  const CeedInt in_node    = in % elem_size;        // input node (flat x + y * P + z * P^2)
  CeedScalar   *elem_mat   = &d_v[elem * e_vec_size * e_vec_size];

  for (CeedInt z = 0; z < P_1D; z++) {
    const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      elem_mat[(in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node] += r_v[z + comp * P_1D];
    }
  }
}
411 
412 //------------------------------------------------------------------------------
413 // E-vector -> L-vector, strided
414 //------------------------------------------------------------------------------
// Accumulate one element's register values r_v (all z levels per thread)
// into a strided L-vector.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt base_node = data.t_id_x + data.t_id_y * P_1D;

  for (CeedInt z = 0; z < P_1D; z++) {
    const CeedInt offset = (base_node + z * P_1D * P_1D) * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      d_v[offset + comp * STRIDES_COMP] += r_v[z + comp * P_1D];
    }
  }
}
427 
428 //------------------------------------------------------------------------------
429 // 3D collocated derivatives computation
430 //------------------------------------------------------------------------------
// Apply the collocated 1D derivative matrix c_G (Q_1D x Q_1D) at quadrature
// slice q, producing X, Y, and Z derivatives in r_V[comp + dim * NUM_COMP].
// r_U holds Q_1D z levels per component (r_U[z + comp * Q_1D]); data.slice is
// block-shared scratch with row stride T_1D used to exchange values in x/y.
//
// NOTE(review): the __syncthreads() calls sit inside the t_id < Q_1D guard;
// this presumes every thread of the block reaches them (e.g. the block is
// launched Q_1D x Q_1D) — a divergent block-wide barrier is undefined
// behavior. TODO confirm against the cuda-gen launch configuration.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                        CeedScalar *__restrict__ r_V) {
  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      // Stage this component's slice q in shared memory so neighboring
      // threads' values are visible along x and y
      __syncthreads();
      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1D];
      __syncthreads();
      // X derivative
      r_V[comp + 0 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1D] * data.slice[i + data.t_id_y * T_1D];
      }
      // Y derivative
      r_V[comp + 1 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1D] * data.slice[data.t_id_x + i * T_1D];
      }
      // Z derivative: all z levels already live in this thread's registers,
      // so no shared-memory exchange is needed
      r_V[comp + 2 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1D] * r_U[i + comp * Q_1D];
      }
    }
  }
}
457 
458 //------------------------------------------------------------------------------
459 // 3D collocated derivatives transpose
460 //------------------------------------------------------------------------------
// Transpose action of GradColloSlice3d: accumulate the X/Y/Z derivative
// inputs r_U[comp + dim * NUM_COMP] back into the slice values r_V, using the
// transposed application of the collocated derivative matrix c_G.
// Accumulates into r_V (no zeroing here); X/Y contributions target slice q,
// while the Z contribution is spread over all z levels of this thread.
//
// NOTE(review): as in GradColloSlice3d, the __syncthreads() calls are inside
// the t_id < Q_1D guard, which presumes all block threads reach them —
// TODO confirm the launch uses a Q_1D x Q_1D block here.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void GradColloSliceTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                                 CeedScalar *__restrict__ r_V) {
  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      // Stage the X-derivative input in shared memory for exchange along x
      __syncthreads();
      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
      __syncthreads();
      // X derivative (note transposed indexing of c_G vs. the forward action)
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[q + comp * Q_1D] += c_G[data.t_id_x + i * Q_1D] * data.slice[i + data.t_id_y * T_1D];
      }
      // Y derivative: restage with the Y-derivative input before exchanging along y
      __syncthreads();
      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
      __syncthreads();
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[q + comp * Q_1D] += c_G[data.t_id_y + i * Q_1D] * data.slice[data.t_id_x + i * T_1D];
      }
      // Z derivative: purely register-local, scattered across all z levels
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[i + comp * Q_1D] += c_G[i + q * Q_1D] * r_U[comp + 2 * NUM_COMP];
      }
    }
  }
}
487