// Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for CUDA backend macro and type definitions for JiT source
#include <ceed/types.h>

//------------------------------------------------------------------------------
// Load matrices for basis actions
//------------------------------------------------------------------------------
template <int P, int Q>
inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
  // Cooperatively copy the P x Q basis matrix from global memory into B,
  // with every thread in the block striding over the entries
  const CeedInt num_threads = blockDim.x * blockDim.y * blockDim.z;

  for (CeedInt i = data.t_id; i < P * Q; i += num_threads) B[i] = d_B[i];
}

//------------------------------------------------------------------------------
// AtPoints
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> single point
//------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
                                 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Gather all components for one point from the L-vector into registers.
  // NOTE(review): unlike WritePoint, there is no `p < points_in_elem` guard here;
  // presumably callers only pass (possibly padded) valid point indices — confirm.
  const CeedInt ind = indices[p + elem * NUM_PTS];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * COMP_STRIDE];
}

//------------------------------------------------------------------------------
// Single point -> L-vector
//------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) {
  // Scatter-add all components for one point back into the L-vector;
  // padding points beyond points_in_elem contribute nothing
  if (p >= points_in_elem) return;

  const CeedInt ind = indices[p + elem * NUM_PTS];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_u[ind + comp * COMP_STRIDE] += r_u[comp];
}

//------------------------------------------------------------------------------
// 1D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// Set E-vector value
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D>
inline __device__ void SetEVecStandard1d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
  // Decompose the flat E-vector index n into (component, node)
  const CeedInt target_comp = n / P_1D;
  const CeedInt target_node = n % P_1D;

  // Only the thread that owns the target node updates its register
  if (data.t_id_x != target_node) return;
  r_v[target_comp] = value;
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void ReadLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // One thread per node; threads beyond the element size do nothing
  if (data.t_id_x >= P_1D) return;

  const CeedInt ind = indices[data.t_id_x + elem * P_1D];

  // Gather every component for this node from the L-vector
  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  // One thread per node; compute the strided base offset for this node/element
  if (data.t_id_x >= P_1D) return;

  const CeedInt ind = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // One thread per node; nodes shared between elements are accumulated atomically
  if (data.t_id_x >= P_1D) return;

  const CeedInt ind = indices[data.t_id_x + elem * P_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
}

template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard1d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
                                                  CeedScalar *__restrict__ d_v) {
  // Write a single E-vector entry n = (component, node) back to the L-vector
  const CeedInt target_comp = n / P_1D;
  const CeedInt target_node = n % P_1D;

  if (data.t_id_x != target_node) return;

  const CeedInt ind = indices[target_node + elem * P_1D];

  atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]);
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, full assembly
//------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard1d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Accumulate one column (input index `in`) of the per-element dense operator.
  // Each element block in d_v is e_vec_size x e_vec_size.
  const CeedInt in_comp     = in / P_1D;
  const CeedInt in_node     = in % P_1D;
  const CeedInt e_vec_size  = P_1D * NUM_COMP;

  if (data.t_id_x >= P_1D) return;

  const CeedInt out_node    = data.t_id_x;
  const CeedInt elem_offset = elem * e_vec_size * e_vec_size;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    d_v[elem_offset + (in_comp * NUM_COMP + comp) * P_1D * P_1D + out_node * P_1D + in_node] += r_v[comp];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, Qfunction assembly
//------------------------------------------------------------------------------
template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
inline __device__ void WriteLVecStandard1d_QFAssembly(SharedData_Cuda &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
                                                      const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Store the field components at each quadrature point for QFunction assembly
  if (data.t_id_x >= Q_1D) return;

  const CeedInt ind         = data.t_id_x + elem * Q_1D;
  const CeedInt comp_stride = Q_1D * num_elem;

  for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
    d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * comp_stride] = r_v[comp];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  // Strided layout: each node is owned by exactly one element, so plain += is safe
  if (data.t_id_x >= P_1D) return;

  const CeedInt ind = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp];
}

//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// Set E-vector value
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D>
inline __device__ void SetEVecStandard2d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
  // Decompose the flat E-vector index n into (component, node_y, node_x)
  const CeedInt target_comp   = n / (P_1D * P_1D);
  const CeedInt target_node_x = n % P_1D;
  const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;

  // Only the thread that owns the target node updates its register
  if (data.t_id_x != target_node_x || data.t_id_y != target_node_y) return;
  r_v[target_comp] = value;
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void ReadLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // One thread per (x, y) node; threads outside the element do nothing
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt local_node = data.t_id_x + data.t_id_y * P_1D;
  const CeedInt ind        = indices[local_node + elem * P_1D * P_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  // One thread per (x, y) node; compute the strided base offset for this node/element
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt local_node = data.t_id_x + data.t_id_y * P_1D;
  const CeedInt ind        = local_node * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // One thread per (x, y) node; nodes shared between elements accumulate atomically
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt local_node = data.t_id_x + data.t_id_y * P_1D;
  const CeedInt ind        = indices[local_node + elem * P_1D * P_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
}

template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard2d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
                                                  CeedScalar *__restrict__ d_v) {
  // Write a single E-vector entry n = (component, node_y, node_x) to the L-vector
  const CeedInt target_comp   = n / (P_1D * P_1D);
  const CeedInt target_node_x = n % P_1D;
  const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;

  if (data.t_id_x != target_node_x || data.t_id_y != target_node_y) return;

  const CeedInt local_node = data.t_id_x + data.t_id_y * P_1D;
  const CeedInt ind        = indices[local_node + elem * P_1D * P_1D];

  atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_comp]);
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, full assembly
//------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard2d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Accumulate one column (input index `in`) of the per-element dense operator.
  // Each element block in d_v is e_vec_size x e_vec_size.
  const CeedInt elem_size  = P_1D * P_1D;
  const CeedInt in_comp    = in / elem_size;
  const CeedInt in_node_x  = in % P_1D;
  const CeedInt in_node_y  = (in % elem_size) / P_1D;
  const CeedInt e_vec_size = elem_size * NUM_COMP;

  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt in_node     = in_node_x + in_node_y * P_1D;
  const CeedInt out_node    = data.t_id_x + data.t_id_y * P_1D;
  const CeedInt elem_offset = elem * e_vec_size * e_vec_size;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    d_v[elem_offset + (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node] += r_v[comp];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, Qfunction assembly
//------------------------------------------------------------------------------
template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
inline __device__ void WriteLVecStandard2d_QFAssembly(SharedData_Cuda &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
                                                      const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Store the field components at each quadrature point for QFunction assembly
  if (data.t_id_x >= Q_1D || data.t_id_y >= Q_1D) return;

  const CeedInt ind         = (data.t_id_x + data.t_id_y * Q_1D) + elem * Q_1D * Q_1D;
  const CeedInt comp_stride = Q_1D * Q_1D * num_elem;

  for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
    d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * comp_stride] = r_v[comp];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  // Strided layout: each node is owned by exactly one element, so plain += is safe
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt local_node = data.t_id_x + data.t_id_y * P_1D;
  const CeedInt ind        = local_node * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp];
}

//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// Set E-vector value
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D>
inline __device__ void SetEVecStandard3d_Single(SharedData_Cuda &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
  // Decompose the flat E-vector index n into (component, node_z, node_y, node_x).
  // Each thread holds a full z-line of values in registers, so only (x, y) is
  // matched against thread indices and z selects the register slot.
  const CeedInt target_comp   = n / (P_1D * P_1D * P_1D);
  const CeedInt target_node_x = n % P_1D;
  const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
  const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D);

  if (data.t_id_x != target_node_x || data.t_id_y != target_node_y) return;
  r_v[target_node_z + target_comp * P_1D] = value;
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // One thread per (x, y) column; each thread gathers its full z-line of nodes
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  for (CeedInt z = 0; z < P_1D; z++) {
    const CeedInt local_node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
    const CeedInt ind        = indices[local_node + elem * P_1D * P_1D * P_1D];

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + COMP_STRIDE * comp];
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  // One thread per (x, y) column; each thread gathers its full z-line of nodes
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  for (CeedInt z = 0; z < P_1D; z++) {
    const CeedInt local_node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
    const CeedInt ind        = local_node * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
//------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int Q_1D>
inline __device__ void ReadEVecSliceStandard3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
                                               const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u,
                                               CeedScalar *__restrict__ r_u) {
  // Gather all components at one z-slice (slice q) of quadrature points
  if (data.t_id_x >= Q_1D || data.t_id_y >= Q_1D) return;

  const CeedInt local_node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
  const CeedInt ind        = indices[local_node + elem * Q_1D * Q_1D * Q_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, strided
//------------------------------------------------------------------------------
template <int NUM_COMP, int Q_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
                                              CeedScalar *__restrict__ r_u) {
  // Gather all components at one z-slice (slice q) of quadrature points
  if (data.t_id_x >= Q_1D || data.t_id_y >= Q_1D) return;

  const CeedInt local_node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
  const CeedInt ind        = local_node * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // One thread per (x, y) column; scatter-add the full z-line, atomically since
  // nodes may be shared between elements
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  for (CeedInt z = 0; z < P_1D; z++) {
    const CeedInt local_node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
    const CeedInt ind        = indices[local_node + elem * P_1D * P_1D * P_1D];

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1D]);
  }
}

template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard3d_Single(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
                                                  CeedScalar *__restrict__ d_v) {
  // Write a single E-vector entry n = (component, node_z, node_y, node_x) to the
  // L-vector; z selects the register slot within the owning (x, y) thread
  const CeedInt target_comp   = n / (P_1D * P_1D * P_1D);
  const CeedInt target_node_x = n % P_1D;
  const CeedInt target_node_y = (n % (P_1D * P_1D)) / P_1D;
  const CeedInt target_node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D);

  if (data.t_id_x != target_node_x || data.t_id_y != target_node_y) return;

  const CeedInt local_node = data.t_id_x + data.t_id_y * P_1D + target_node_z * P_1D * P_1D;
  const CeedInt ind        = indices[local_node + elem * P_1D * P_1D * P_1D];

  atomicAdd(&d_v[ind + COMP_STRIDE * target_comp], r_v[target_node_z + target_comp * P_1D]);
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, full assembly
//------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard3d_Assembly(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Accumulate one column (input index `in`) of the per-element dense operator.
  // Each element block in d_v is e_vec_size x e_vec_size.
  const CeedInt elem_size  = P_1D * P_1D * P_1D;
  const CeedInt in_comp    = in / elem_size;
  const CeedInt in_node_x  = in % P_1D;
  const CeedInt in_node_y  = (in % (P_1D * P_1D)) / P_1D;
  const CeedInt in_node_z  = (in % elem_size) / (P_1D * P_1D);
  const CeedInt e_vec_size = elem_size * NUM_COMP;

  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt in_node     = in_node_x + in_node_y * P_1D + in_node_z * P_1D * P_1D;
  const CeedInt elem_offset = elem * e_vec_size * e_vec_size;

  for (CeedInt z = 0; z < P_1D; z++) {
    const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      d_v[elem_offset + (in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node] += r_v[z + comp * P_1D];
    }
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, Qfunction assembly
//------------------------------------------------------------------------------
template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
inline __device__ void WriteLVecStandard3d_QFAssembly(SharedData_Cuda &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
                                                      const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Store the field components at each quadrature point for QFunction assembly;
  // each thread handles a full z-line of quadrature points
  if (data.t_id_x >= Q_1D || data.t_id_y >= Q_1D) return;

  const CeedInt comp_stride = Q_1D * Q_1D * Q_1D * num_elem;

  for (CeedInt z = 0; z < Q_1D; z++) {
    const CeedInt ind = (data.t_id_x + data.t_id_y * Q_1D + z * Q_1D * Q_1D) + elem * Q_1D * Q_1D * Q_1D;

    for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
      d_v[ind + (input_offset * NUM_COMP_OUT + output_offset + comp) * comp_stride] = r_v[z + comp * Q_1D];
    }
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  // Strided layout: each node is owned by exactly one element, so plain += is safe;
  // each thread writes its full z-line of nodes
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  for (CeedInt z = 0; z < P_1D; z++) {
    const CeedInt local_node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
    const CeedInt ind        = local_node * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1D];
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives computation
//------------------------------------------------------------------------------
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                        CeedScalar *__restrict__ r_V) {
  // Compute the X/Y/Z collocated derivatives at z-slice q. X and Y derivatives
  // exchange values through the data.slice scratch buffer (hence the barriers);
  // the Z derivative uses only this thread's own registers.
  // Fix: the __syncthreads() calls are now executed in uniform control flow.
  // Previously they sat inside the (t_id_x < Q_1D && t_id_y < Q_1D) guard, which
  // is undefined behavior whenever the block has extra threads (T_1D > Q_1D),
  // since those threads would skip the barrier.
  const bool is_active = data.t_id_x < Q_1D && data.t_id_y < Q_1D;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    __syncthreads();
    if (is_active) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1D];
    __syncthreads();
    if (is_active) {
      // X derivative
      r_V[comp + 0 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1D] * data.slice[i + data.t_id_y * T_1D];
      }
      // Y derivative
      r_V[comp + 1 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1D] * data.slice[data.t_id_x + i * T_1D];
      }
      // Z derivative (register-only, no cross-thread exchange needed)
      r_V[comp + 2 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1D] * r_U[i + comp * Q_1D];
      }
    }
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives transpose
//------------------------------------------------------------------------------
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void GradColloSliceTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                                 CeedScalar *__restrict__ r_V) {
  // Apply the transpose of the collocated gradient at z-slice q, accumulating
  // into r_V. X and Y contributions exchange values through data.slice; the Z
  // contribution uses only this thread's own registers.
  // Fix: the __syncthreads() calls are now executed in uniform control flow.
  // Previously they sat inside the (t_id_x < Q_1D && t_id_y < Q_1D) guard, which
  // is undefined behavior whenever the block has extra threads (T_1D > Q_1D),
  // since those threads would skip the barrier.
  const bool is_active = data.t_id_x < Q_1D && data.t_id_y < Q_1D;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // X derivative (transpose)
    __syncthreads();
    if (is_active) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
    __syncthreads();
    if (is_active) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[q + comp * Q_1D] += c_G[data.t_id_x + i * Q_1D] * data.slice[i + data.t_id_y * T_1D];
      }
    }
    // Y derivative (transpose)
    __syncthreads();
    if (is_active) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
    __syncthreads();
    if (is_active) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[q + comp * Q_1D] += c_G[data.t_id_y + i * Q_1D] * data.slice[data.t_id_x + i * T_1D];
      }
      // Z derivative (transpose): register-only, no cross-thread exchange needed
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[i + comp * Q_1D] += c_G[i + q * Q_1D] * r_U[comp + 2 * NUM_COMP];
      }
    }
  }
}
