1 // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED: http://github.com/ceed
7
8 /// @file
/// Internal header for HIP backend basis read/write templates for JiT source
10 #include <ceed/types.h>
11
12 //------------------------------------------------------------------------------
13 // Load matrices for basis actions
14 //------------------------------------------------------------------------------
template <int P, int Q>
inline __device__ void LoadMatrix(SharedData_Hip &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
  // Cooperatively copy the P x Q basis matrix from global memory into B,
  // spreading the P * Q entries over every thread in the block
  const CeedInt num_threads = blockDim.x * blockDim.y * blockDim.z;

  for (CeedInt entry = data.t_id; entry < P * Q; entry += num_threads) {
    B[entry] = d_B[entry];
  }
}
19
20 //------------------------------------------------------------------------------
21 // AtPoints
22 //------------------------------------------------------------------------------
23
24 //------------------------------------------------------------------------------
25 // L-vector -> single point
26 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
inline __device__ void ReadPoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
                                 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  // Gather all components of one point from the L-vector into registers.
  // NOTE(review): unlike WritePoint there is no `p < points_in_elem` guard here;
  // presumably callers only pass valid point indices -- confirm at call sites.
  const CeedInt l_vec_ind = indices[p + elem * NUM_PTS];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[l_vec_ind + comp * COMP_STRIDE];
  }
}
36
37 //------------------------------------------------------------------------------
38 // Single point -> L-vector
39 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
inline __device__ void WritePoint(SharedData_Hip &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) {
  // Sum all components of one point back into the L-vector.
  // Padding points (p >= points_in_elem) are skipped.
  if (p >= points_in_elem) return;

  const CeedInt l_vec_ind = indices[p + elem * NUM_PTS];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    d_u[l_vec_ind + comp * COMP_STRIDE] += r_u[comp];
  }
}
51
52 //------------------------------------------------------------------------------
53 // 1D
54 //------------------------------------------------------------------------------
55
56 //------------------------------------------------------------------------------
57 // Set E-vector value
58 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D>
inline __device__ void SetEVecStandard1d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
  // Set a single E-vector entry, where n = comp * P_1D + node.
  // Only the thread that owns the targeted node writes.
  const CeedInt comp = n / P_1D;
  const CeedInt node = n % P_1D;

  if (data.t_id_x == node) r_v[comp] = value;
}
68
69 //------------------------------------------------------------------------------
70 // L-vector -> E-vector, offsets provided
71 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void ReadLVecStandard1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Gather one element's nodal values from the L-vector via restriction offsets;
  // each active thread owns one 1D node.
  if (data.t_id_x >= P_1D) return;

  const CeedInt l_vec_ind = indices[data.t_id_x + elem * P_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[l_vec_ind + COMP_STRIDE * comp];
  }
}
82
83 //------------------------------------------------------------------------------
84 // L-vector -> E-vector, strided
85 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Gather one element's nodal values from a strided (no-offsets) L-vector.
  if (data.t_id_x >= P_1D) return;

  const CeedInt base = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[base + comp * STRIDES_COMP];
  }
}
95
96 //------------------------------------------------------------------------------
97 // E-vector -> L-vector, offsets provided
98 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Sum one element's nodal values into the L-vector via restriction offsets;
  // atomicAdd protects entries updated concurrently from multiple threads.
  if (data.t_id_x >= P_1D) return;

  const CeedInt l_vec_ind = indices[data.t_id_x + elem * P_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    atomicAdd(&d_v[l_vec_ind + COMP_STRIDE * comp], r_v[comp]);
  }
}
109
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard1d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
                                                  CeedScalar *__restrict__ d_v) {
  // Sum a single E-vector entry (n = comp * P_1D + node) into the L-vector.
  // Only the thread that owns the targeted node contributes.
  const CeedInt comp = n / P_1D;
  const CeedInt node = n % P_1D;

  if (data.t_id_x != node) return;

  const CeedInt l_vec_ind = indices[node + elem * P_1D];

  atomicAdd(&d_v[l_vec_ind + COMP_STRIDE * comp], r_v[comp]);
}
123
124 //------------------------------------------------------------------------------
125 // E-vector -> L-vector, full assembly
126 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard1d_Assembly(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Accumulate one input-DOF column of the dense per-element matrix.
  // d_v holds one e_vec_size x e_vec_size block per element, grouped by
  // (input component, output component) pairs of P_1D x P_1D sub-blocks.
  const CeedInt in_comp    = in / P_1D;
  const CeedInt in_node    = in % P_1D;
  const CeedInt e_vec_size = P_1D * NUM_COMP;

  if (data.t_id_x >= P_1D) return;

  const CeedInt out_node = data.t_id_x;
  CeedScalar   *elem_mat = &d_v[elem * e_vec_size * e_vec_size];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    elem_mat[(in_comp * NUM_COMP + comp) * P_1D * P_1D + out_node * P_1D + in_node] += r_v[comp];
  }
}
142
143 //------------------------------------------------------------------------------
144 // E-vector -> L-vector, Qfunction assembly
145 //------------------------------------------------------------------------------
template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
inline __device__ void WriteLVecStandard1d_QFAssembly(SharedData_Hip &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
                                                      const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Store quadrature-point values for QFunction assembly; each (input, output)
  // component pair occupies a contiguous block of Q_1D * num_elem entries.
  if (data.t_id_x >= Q_1D) return;

  const CeedInt quad = data.t_id_x + elem * Q_1D;

  for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
    d_v[quad + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * num_elem)] = r_v[comp];
  }
}
157
158 //------------------------------------------------------------------------------
159 // E-vector -> L-vector, strided
160 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  // Sum one element's nodal values into a strided (no-offsets) L-vector.
  if (data.t_id_x >= P_1D) return;

  const CeedInt base = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    d_v[base + comp * STRIDES_COMP] += r_v[comp];
  }
}
171
172 //------------------------------------------------------------------------------
173 // 2D
174 //------------------------------------------------------------------------------
175
176 //------------------------------------------------------------------------------
177 // Set E-vector value
178 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D>
inline __device__ void SetEVecStandard2d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
  // Set a single E-vector entry, where n = comp * P_1D^2 + node_y * P_1D + node_x.
  // Only the thread that owns the targeted 2D node writes.
  const CeedInt comp   = n / (P_1D * P_1D);
  const CeedInt node_x = n % P_1D;
  const CeedInt node_y = (n % (P_1D * P_1D)) / P_1D;

  if (data.t_id_x == node_x && data.t_id_y == node_y) r_v[comp] = value;
}
189
190 //------------------------------------------------------------------------------
191 // L-vector -> E-vector, offsets provided
192 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void ReadLVecStandard2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Gather one element's nodal values from the L-vector via restriction offsets;
  // each active thread owns one 2D node.
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt local_node = data.t_id_x + data.t_id_y * P_1D;
  const CeedInt l_vec_ind  = indices[local_node + elem * P_1D * P_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[l_vec_ind + COMP_STRIDE * comp];
  }
}
203
204 //------------------------------------------------------------------------------
205 // L-vector -> E-vector, strided
206 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Gather one element's nodal values from a strided (no-offsets) L-vector.
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt local_node = data.t_id_x + data.t_id_y * P_1D;
  const CeedInt base       = local_node * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[base + comp * STRIDES_COMP];
  }
}
216
217 //------------------------------------------------------------------------------
218 // E-vector -> L-vector, offsets provided
219 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Sum one element's nodal values into the L-vector via restriction offsets;
  // atomicAdd protects entries updated concurrently from multiple threads.
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt local_node = data.t_id_x + data.t_id_y * P_1D;
  const CeedInt l_vec_ind  = indices[local_node + elem * P_1D * P_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    atomicAdd(&d_v[l_vec_ind + COMP_STRIDE * comp], r_v[comp]);
  }
}
230
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard2d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
                                                  CeedScalar *__restrict__ d_v) {
  // Sum a single E-vector entry (n = comp * P_1D^2 + node_y * P_1D + node_x)
  // into the L-vector. Only the thread that owns the targeted node contributes.
  const CeedInt comp   = n / (P_1D * P_1D);
  const CeedInt node_x = n % P_1D;
  const CeedInt node_y = (n % (P_1D * P_1D)) / P_1D;

  if (data.t_id_x != node_x || data.t_id_y != node_y) return;

  const CeedInt local_node = node_x + node_y * P_1D;
  const CeedInt l_vec_ind  = indices[local_node + elem * P_1D * P_1D];

  atomicAdd(&d_v[l_vec_ind + COMP_STRIDE * comp], r_v[comp]);
}
246
247 //------------------------------------------------------------------------------
248 // E-vector -> L-vector, full assembly
249 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard2d_Assembly(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Accumulate one input-DOF column of the dense per-element matrix.
  // d_v holds one e_vec_size x e_vec_size block per element, grouped by
  // (input component, output component) pairs of elem_size x elem_size sub-blocks.
  const CeedInt elem_size  = P_1D * P_1D;
  const CeedInt e_vec_size = elem_size * NUM_COMP;
  const CeedInt in_comp    = in / elem_size;
  const CeedInt in_node    = in % elem_size;  // = in_node_x + in_node_y * P_1D

  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D;
  CeedScalar   *elem_mat = &d_v[elem * e_vec_size * e_vec_size];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    elem_mat[(in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node] += r_v[comp];
  }
}
270
271 //------------------------------------------------------------------------------
272 // E-vector -> L-vector, Qfunction assembly
273 //------------------------------------------------------------------------------
template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
inline __device__ void WriteLVecStandard2d_QFAssembly(SharedData_Hip &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
                                                      const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Store quadrature-point values for QFunction assembly; each (input, output)
  // component pair occupies a contiguous block of Q_1D^2 * num_elem entries.
  if (data.t_id_x >= Q_1D || data.t_id_y >= Q_1D) return;

  const CeedInt quad = (data.t_id_x + data.t_id_y * Q_1D) + elem * Q_1D * Q_1D;

  for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
    d_v[quad + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * Q_1D * num_elem)] = r_v[comp];
  }
}
285
286 //------------------------------------------------------------------------------
287 // E-vector -> L-vector, strided
288 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  // Sum one element's nodal values into a strided (no-offsets) L-vector.
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt local_node = data.t_id_x + data.t_id_y * P_1D;
  const CeedInt base       = local_node * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    d_v[base + comp * STRIDES_COMP] += r_v[comp];
  }
}
299
300 //------------------------------------------------------------------------------
301 // 3D
302 //------------------------------------------------------------------------------
303
304 //------------------------------------------------------------------------------
305 // Set E-vector value
306 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D>
inline __device__ void SetEVecStandard3d_Single(SharedData_Hip &data, const CeedInt n, const CeedScalar value, CeedScalar *__restrict__ r_v) {
  // Set a single E-vector entry, where n indexes (comp, node_z, node_y, node_x).
  // Threads hold a z-column in registers, so the owning (x, y) thread writes
  // slot node_z + comp * P_1D.
  const CeedInt comp   = n / (P_1D * P_1D * P_1D);
  const CeedInt node_x = n % P_1D;
  const CeedInt node_y = ((n % (P_1D * P_1D * P_1D)) / P_1D) % P_1D;
  const CeedInt node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D);

  if (data.t_id_x == node_x && data.t_id_y == node_y) r_v[node_z + comp * P_1D] = value;
}
318
319 //------------------------------------------------------------------------------
320 // L-vector -> E-vector, offsets provided
321 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void ReadLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Gather one element's nodal values from the L-vector via restriction offsets;
  // each active (x, y) thread reads its whole z-column into registers.
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  for (CeedInt z = 0; z < P_1D; z++) {
    const CeedInt local_node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
    const CeedInt l_vec_ind  = indices[local_node + elem * P_1D * P_1D * P_1D];

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      r_u[z + comp * P_1D] = d_u[l_vec_ind + COMP_STRIDE * comp];
    }
  }
}
334
335 //------------------------------------------------------------------------------
336 // L-vector -> E-vector, strided
337 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Gather one element's nodal values from a strided (no-offsets) L-vector;
  // each active (x, y) thread reads its whole z-column into registers.
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  for (CeedInt z = 0; z < P_1D; z++) {
    const CeedInt local_node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
    const CeedInt base       = local_node * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      r_u[z + comp * P_1D] = d_u[base + comp * STRIDES_COMP];
    }
  }
}
349
350 //------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
352 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int Q_1D>
inline __device__ void ReadEVecSliceStandard3d(SharedData_Hip &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
                                               const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u,
                                               CeedScalar *__restrict__ r_u) {
  // Gather one z-slice (fixed q) of quadrature values via offsets; each active
  // (x, y) thread reads one point of the slice.
  if (data.t_id_x >= Q_1D || data.t_id_y >= Q_1D) return;

  const CeedInt local_node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
  const CeedInt e_vec_ind  = indices[local_node + elem * Q_1D * Q_1D * Q_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[e_vec_ind + COMP_STRIDE * comp];
  }
}
364
365 //------------------------------------------------------------------------------
366 // E-vector -> Q-vector, strided
367 //------------------------------------------------------------------------------
template <int NUM_COMP, int Q_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadEVecSliceStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
                                              CeedScalar *__restrict__ r_u) {
  // Gather one z-slice (fixed q) of quadrature values from strided storage;
  // each active (x, y) thread reads one point of the slice.
  if (data.t_id_x >= Q_1D || data.t_id_y >= Q_1D) return;

  const CeedInt local_node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
  const CeedInt base       = local_node * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[base + comp * STRIDES_COMP];
  }
}
378
379 //------------------------------------------------------------------------------
380 // E-vector -> L-vector, offsets provided
381 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Sum one element's nodal values into the L-vector via restriction offsets;
  // each active (x, y) thread writes its whole z-column, using atomicAdd to
  // protect entries updated concurrently from multiple threads.
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  for (CeedInt z = 0; z < P_1D; z++) {
    const CeedInt local_node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
    const CeedInt l_vec_ind  = indices[local_node + elem * P_1D * P_1D * P_1D];

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      atomicAdd(&d_v[l_vec_ind + COMP_STRIDE * comp], r_v[z + comp * P_1D]);
    }
  }
}
394
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard3d_Single(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt n,
                                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_v,
                                                  CeedScalar *__restrict__ d_v) {
  // Sum a single E-vector entry into the L-vector, where n indexes
  // (comp, node_z, node_y, node_x). Threads hold a z-column in registers, so
  // only the owning (x, y) thread contributes its node_z + comp * P_1D slot.
  const CeedInt comp   = n / (P_1D * P_1D * P_1D);
  const CeedInt node_x = n % P_1D;
  const CeedInt node_y = ((n % (P_1D * P_1D * P_1D)) / P_1D) % P_1D;
  const CeedInt node_z = (n % (P_1D * P_1D * P_1D)) / (P_1D * P_1D);

  if (data.t_id_x != node_x || data.t_id_y != node_y) return;

  const CeedInt local_node = node_x + node_y * P_1D + node_z * P_1D * P_1D;
  const CeedInt l_vec_ind  = indices[local_node + elem * P_1D * P_1D * P_1D];

  atomicAdd(&d_v[l_vec_ind + COMP_STRIDE * comp], r_v[node_z + comp * P_1D]);
}
411
412 //------------------------------------------------------------------------------
413 // E-vector -> L-vector, full assembly
414 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard3d_Assembly(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt in,
                                                    const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Accumulate one input-DOF column of the dense per-element matrix.
  // d_v holds one e_vec_size x e_vec_size block per element, grouped by
  // (input component, output component) pairs of elem_size x elem_size sub-blocks.
  const CeedInt elem_size  = P_1D * P_1D * P_1D;
  const CeedInt e_vec_size = elem_size * NUM_COMP;
  const CeedInt in_comp    = in / elem_size;
  const CeedInt in_node    = in % elem_size;  // = in_node_x + in_node_y * P_1D + in_node_z * P_1D^2

  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  CeedScalar *elem_mat = &d_v[elem * e_vec_size * e_vec_size];

  for (CeedInt z = 0; z < P_1D; z++) {
    const CeedInt out_node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      elem_mat[(in_comp * NUM_COMP + comp) * elem_size * elem_size + out_node * elem_size + in_node] += r_v[z + comp * P_1D];
    }
  }
}
438
439 //------------------------------------------------------------------------------
440 // E-vector -> L-vector, Qfunction assembly
441 //------------------------------------------------------------------------------
template <int NUM_COMP_OUT, int NUM_COMP_FIELD, int Q_1D>
inline __device__ void WriteLVecStandard3d_QFAssembly(SharedData_Hip &data, const CeedInt num_elem, const CeedInt elem, const CeedInt input_offset,
                                                      const CeedInt output_offset, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Store quadrature-point values for QFunction assembly; each (input, output)
  // component pair occupies a contiguous block of Q_1D^3 * num_elem entries,
  // and each active (x, y) thread writes its whole z-column.
  if (data.t_id_x >= Q_1D || data.t_id_y >= Q_1D) return;

  for (CeedInt z = 0; z < Q_1D; z++) {
    const CeedInt quad = (data.t_id_x + data.t_id_y * Q_1D + z * Q_1D * Q_1D) + elem * Q_1D * Q_1D * Q_1D;

    for (CeedInt comp = 0; comp < NUM_COMP_FIELD; comp++) {
      d_v[quad + (input_offset * NUM_COMP_OUT + output_offset + comp) * (Q_1D * Q_1D * Q_1D * num_elem)] = r_v[z + comp * Q_1D];
    }
  }
}
455
456 //------------------------------------------------------------------------------
457 // E-vector -> L-vector, strided
458 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  // Sum one element's nodal values into a strided (no-offsets) L-vector;
  // each active (x, y) thread writes its whole z-column.
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  for (CeedInt z = 0; z < P_1D; z++) {
    const CeedInt local_node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
    const CeedInt base       = local_node * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      d_v[base + comp * STRIDES_COMP] += r_v[z + comp * P_1D];
    }
  }
}
471
472 //------------------------------------------------------------------------------
473 // 3D collocated derivatives computation
474 //------------------------------------------------------------------------------
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void GradColloSlice3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                        CeedScalar *__restrict__ r_V) {
  // Compute collocated X/Y/Z derivatives at z-slice q from register values r_U,
  // staging each component's slice through shared memory (data.slice).
  //
  // Fix: the original placed __syncthreads() inside the divergent
  // `t_id < Q_1D` guard; when the block is T_1D x T_1D with T_1D > Q_1D the
  // inactive threads never reached the barrier (undefined behavior / hang).
  // All threads must call this function; barriers below are unconditional.
  const bool is_active = data.t_id_x < Q_1D && data.t_id_y < Q_1D;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    __syncthreads();
    if (is_active) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1D];
    __syncthreads();
    if (is_active) {
      // X derivative
      r_V[comp + 0 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1D] * data.slice[i + data.t_id_y * T_1D];
      }
      // Y derivative
      r_V[comp + 1 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1D] * data.slice[data.t_id_x + i * T_1D];
      }
      // Z derivative (register-only contraction along the thread's z-column)
      r_V[comp + 2 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1D] * r_U[i + comp * Q_1D];
      }
    }
  }
}
501
502 //------------------------------------------------------------------------------
503 // 3D collocated derivatives transpose
504 //------------------------------------------------------------------------------
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void GradColloSliceTranspose3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                                 CeedScalar *__restrict__ r_V) {
  // Transpose (adjoint) of GradColloSlice3d: accumulate the X/Y/Z derivative
  // contributions of slice q back into r_V, staging X and Y slices through
  // shared memory (data.slice).
  //
  // Fix: the original placed __syncthreads() inside the divergent
  // `t_id < Q_1D` guard; when the block is T_1D x T_1D with T_1D > Q_1D the
  // inactive threads never reached the barrier (undefined behavior / hang).
  // All threads must call this function; barriers below are unconditional.
  const bool is_active = data.t_id_x < Q_1D && data.t_id_y < Q_1D;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // X derivative
    __syncthreads();
    if (is_active) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
    __syncthreads();
    if (is_active) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[q + comp * Q_1D] += c_G[data.t_id_x + i * Q_1D] * data.slice[i + data.t_id_y * T_1D];
      }
    }
    // Y derivative
    __syncthreads();
    if (is_active) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
    __syncthreads();
    if (is_active) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[q + comp * Q_1D] += c_G[data.t_id_y + i * Q_1D] * data.slice[data.t_id_x + i * T_1D];
      }
      // Z derivative (register-only, no shared-memory staging needed)
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[i + comp * Q_1D] += c_G[i + q * Q_1D] * r_U[comp + 2 * NUM_COMP];
      }
    }
  }
}
531