1 // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED: http://github.com/ceed
7
8 /// @file
9 /// Internal header for CUDA shared memory tensor product basis templates
10 #include <ceed/types.h>
11
12 //------------------------------------------------------------------------------
13 // 1D
14 //------------------------------------------------------------------------------
15
16 //------------------------------------------------------------------------------
17 // 1D tensor contraction x
18 //------------------------------------------------------------------------------
19 template <int NUM_COMP, int P_1D, int Q_1D>
ContractX1d(SharedData_Cuda & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)20 inline __device__ void ContractX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
21 __syncthreads();
22 data.slice[data.t_id_x] = *U;
23 __syncthreads();
24 *V = 0.0;
25 if (data.t_id_x < Q_1D) {
26 for (CeedInt i = 0; i < P_1D; i++) {
27 *V += B[i + data.t_id_x * P_1D] * data.slice[i]; // Contract x direction
28 }
29 }
30 }
31
32 //------------------------------------------------------------------------------
33 // 1D transpose tensor contraction x
34 //------------------------------------------------------------------------------
35 template <int NUM_COMP, int P_1D, int Q_1D>
ContractTransposeX1d(SharedData_Cuda & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)36 inline __device__ void ContractTransposeX1d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
37 __syncthreads();
38 data.slice[data.t_id_x] = *U;
39 __syncthreads();
40 *V = 0.0;
41 if (data.t_id_x < P_1D) {
42 for (CeedInt i = 0; i < Q_1D; i++) {
43 *V += B[data.t_id_x + i * P_1D] * data.slice[i]; // Contract x direction
44 }
45 }
46 }
47
48 //------------------------------------------------------------------------------
49 // 1D interpolate to quadrature points
50 //------------------------------------------------------------------------------
51 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
Interp1d(SharedData_Cuda & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)52 inline __device__ void Interp1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, CeedScalar *__restrict__ r_V) {
53 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
54 ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
55 }
56 }
57
58 //------------------------------------------------------------------------------
59 // 1D interpolate transpose
60 //------------------------------------------------------------------------------
61 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTranspose1d(SharedData_Cuda & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)62 inline __device__ void InterpTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
63 CeedScalar *__restrict__ r_V) {
64 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
65 ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_B, &r_V[comp]);
66 }
67 }
68
69 //------------------------------------------------------------------------------
70 // 1D interpolate to quadrature points, nodes and quadrature points collocated
71 //------------------------------------------------------------------------------
72 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpCollocatedNodes1d(SharedData_Cuda & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)73 inline __device__ void InterpCollocatedNodes1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
74 CeedScalar *__restrict__ r_V) {
75 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
76 r_V[comp] = r_U[comp];
77 }
78 }
79
80 //------------------------------------------------------------------------------
81 // 1D interpolate transpose, nodes and quadrature points collocated
82 //------------------------------------------------------------------------------
83 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTransposeCollocatedNodes1d(SharedData_Cuda & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)84 inline __device__ void InterpTransposeCollocatedNodes1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
85 CeedScalar *__restrict__ r_V) {
86 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
87 r_V[comp] = r_U[comp];
88 }
89 }
90
91 //------------------------------------------------------------------------------
92 // 1D derivatives at quadrature points
93 //------------------------------------------------------------------------------
94 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
Grad1d(SharedData_Cuda & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)95 inline __device__ void Grad1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
96 CeedScalar *__restrict__ r_V) {
97 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
98 ContractX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
99 }
100 }
101
102 //------------------------------------------------------------------------------
103 // 1D derivatives transpose
104 //------------------------------------------------------------------------------
105 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTranspose1d(SharedData_Cuda & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)106 inline __device__ void GradTranspose1d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
107 CeedScalar *__restrict__ r_V) {
108 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
109 ContractTransposeX1d<NUM_COMP, P_1D, Q_1D>(data, &r_U[comp], c_G, &r_V[comp]);
110 }
111 }
112
113 //------------------------------------------------------------------------------
114 // 1D quadrature weights
115 //------------------------------------------------------------------------------
116 template <int P_1D, int Q_1D>
Weight1d(SharedData_Cuda & data,const CeedScalar * __restrict__ q_weight_1d,CeedScalar * w)117 inline __device__ void Weight1d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
118 *w = (data.t_id_x < Q_1D) ? q_weight_1d[data.t_id_x] : 0.0;
119 }
120
121 //------------------------------------------------------------------------------
122 // 2D
123 //------------------------------------------------------------------------------
124
125 //------------------------------------------------------------------------------
126 // 2D tensor contraction x
127 //------------------------------------------------------------------------------
128 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractX2d(SharedData_Cuda & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)129 inline __device__ void ContractX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
130 __syncthreads();
131 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
132 __syncthreads();
133 *V = 0.0;
134 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
135 for (CeedInt i = 0; i < P_1D; i++) {
136 *V += B[i + data.t_id_x * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction
137 }
138 }
139 }
140
141 //------------------------------------------------------------------------------
142 // 2D tensor contract y
143 //------------------------------------------------------------------------------
144 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractY2d(SharedData_Cuda & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)145 inline __device__ void ContractY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
146 __syncthreads();
147 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
148 __syncthreads();
149 *V = 0.0;
150 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
151 for (CeedInt i = 0; i < P_1D; i++) {
152 *V += B[i + data.t_id_y * P_1D] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction
153 }
154 }
155 }
156
157 //------------------------------------------------------------------------------
158 // 2D transpose tensor contract y
159 //------------------------------------------------------------------------------
160 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeY2d(SharedData_Cuda & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)161 inline __device__ void ContractTransposeY2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
162 __syncthreads();
163 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
164 __syncthreads();
165 *V = 0.0;
166 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
167 for (CeedInt i = 0; i < Q_1D; i++) {
168 *V += B[data.t_id_y + i * P_1D] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction
169 }
170 }
171 }
172
173 //------------------------------------------------------------------------------
174 // 2D transpose tensor contract x
175 //------------------------------------------------------------------------------
176 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeX2d(SharedData_Cuda & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)177 inline __device__ void ContractTransposeX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
178 __syncthreads();
179 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
180 __syncthreads();
181 *V = 0.0;
182 if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
183 for (CeedInt i = 0; i < Q_1D; i++) {
184 *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction
185 }
186 }
187 }
188
189 //------------------------------------------------------------------------------
190 // 2D transpose tensor contract and add x
191 //------------------------------------------------------------------------------
192 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeAddX2d(SharedData_Cuda & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)193 inline __device__ void ContractTransposeAddX2d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
194 __syncthreads();
195 data.slice[data.t_id_x + data.t_id_y * T_1D] = *U;
196 __syncthreads();
197 if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
198 for (CeedInt i = 0; i < Q_1D; i++) {
199 *V += B[data.t_id_x + i * P_1D] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction
200 }
201 }
202 }
203
204 //------------------------------------------------------------------------------
205 // 2D interpolate to quadrature points
206 //------------------------------------------------------------------------------
207 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTensor2d(SharedData_Cuda & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)208 inline __device__ void InterpTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
209 CeedScalar *__restrict__ r_V) {
210 CeedScalar r_t[1];
211 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
212 ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
213 ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
214 }
215 }
216
217 //------------------------------------------------------------------------------
218 // 2D interpolate transpose
219 //------------------------------------------------------------------------------
220 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTransposeTensor2d(SharedData_Cuda & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)221 inline __device__ void InterpTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
222 CeedScalar *__restrict__ r_V) {
223 CeedScalar r_t[1];
224 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
225 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
226 ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
227 }
228 }
229
230 //------------------------------------------------------------------------------
231 // 2D interpolate to quadrature points, nodes and quadrature points collocated
232 //------------------------------------------------------------------------------
233 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTensorCollocatedNodes2d(SharedData_Cuda & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)234 inline __device__ void InterpTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
235 CeedScalar *__restrict__ r_V) {
236 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
237 r_V[comp] = r_U[comp];
238 }
239 }
240
241 //------------------------------------------------------------------------------
242 // 2D interpolate transpose, nodes and quadrature points collocated
243 //------------------------------------------------------------------------------
244 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTransposeTensorCollocatedNodes2d(SharedData_Cuda & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)245 inline __device__ void InterpTransposeTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
246 CeedScalar *__restrict__ r_V) {
247 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
248 r_V[comp] = r_U[comp];
249 }
250 }
251
252 //------------------------------------------------------------------------------
253 // 2D derivatives at quadrature points
254 //------------------------------------------------------------------------------
255 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTensor2d(SharedData_Cuda & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)256 inline __device__ void GradTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
257 CeedScalar *__restrict__ r_V) {
258 CeedScalar r_t[1];
259 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
260 ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, r_t);
261 ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp + 0 * NUM_COMP]);
262 ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_B, r_t);
263 ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp + 1 * NUM_COMP]);
264 }
265 }
266
267 //------------------------------------------------------------------------------
268 // 2D derivatives transpose
269 //------------------------------------------------------------------------------
270 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTransposeTensor2d(SharedData_Cuda & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)271 inline __device__ void GradTransposeTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
272 CeedScalar *__restrict__ r_V) {
273 CeedScalar r_t[1];
274 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
275 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_B, r_t);
276 ContractTransposeX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_G, &r_V[comp]);
277 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, r_t);
278 ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t, c_B, &r_V[comp]);
279 }
280 }
281
282 //------------------------------------------------------------------------------
283 // 2D derivatives at quadrature points, nodes and quadrature points collocated
284 //------------------------------------------------------------------------------
285 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTensorCollocatedNodes2d(SharedData_Cuda & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)286 inline __device__ void GradTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
287 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
288 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
289 ContractX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 0 * NUM_COMP]);
290 ContractY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp], c_G, &r_V[comp + 1 * NUM_COMP]);
291 }
292 }
293
294 //------------------------------------------------------------------------------
295 // 2D derivatives transpose, nodes and quadrature points collocated
296 //------------------------------------------------------------------------------
297 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTransposeTensorCollocatedNodes2d(SharedData_Cuda & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)298 inline __device__ void GradTransposeTensorCollocatedNodes2d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
299 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
300 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
301 ContractTransposeY2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 1 * NUM_COMP], c_G, &r_V[comp]);
302 ContractTransposeAddX2d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp + 0 * NUM_COMP], c_G, &r_V[comp]);
303 }
304 }
305
306 //------------------------------------------------------------------------------
307 // 2D quadrature weights
308 //------------------------------------------------------------------------------
309 template <int P_1D, int Q_1D>
WeightTensor2d(SharedData_Cuda & data,const CeedScalar * __restrict__ q_weight_1d,CeedScalar * w)310 inline __device__ void WeightTensor2d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
311 *w = (data.t_id_x < Q_1D && data.t_id_y < Q_1D) ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
312 }
313
314 //------------------------------------------------------------------------------
315 // 3D
316 //------------------------------------------------------------------------------
317
318 //------------------------------------------------------------------------------
319 // 3D tensor contract x
320 //------------------------------------------------------------------------------
321 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractX3d(SharedData_Cuda & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)322 inline __device__ void ContractX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
323 CeedScalar r_B[P_1D];
324 for (CeedInt i = 0; i < P_1D; i++) {
325 r_B[i] = B[i + data.t_id_x * P_1D];
326 }
327
328 for (CeedInt k = 0; k < P_1D; k++) {
329 __syncthreads();
330 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
331 __syncthreads();
332 V[k] = 0.0;
333 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
334 for (CeedInt i = 0; i < P_1D; i++) {
335 V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction
336 }
337 }
338 }
339 }
340
341 //------------------------------------------------------------------------------
342 // 3D tensor contract y
343 //------------------------------------------------------------------------------
344 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractY3d(SharedData_Cuda & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)345 inline __device__ void ContractY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
346 CeedScalar r_B[P_1D];
347 for (CeedInt i = 0; i < P_1D; i++) {
348 r_B[i] = B[i + data.t_id_y * P_1D];
349 }
350
351 for (CeedInt k = 0; k < P_1D; k++) {
352 __syncthreads();
353 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
354 __syncthreads();
355 V[k] = 0.0;
356 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
357 for (CeedInt i = 0; i < P_1D; i++) {
358 V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction
359 }
360 }
361 }
362 }
363
364 //------------------------------------------------------------------------------
365 // 3D tensor contract z
366 //------------------------------------------------------------------------------
367 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractZ3d(SharedData_Cuda & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)368 inline __device__ void ContractZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
369 for (CeedInt k = 0; k < Q_1D; k++) {
370 V[k] = 0.0;
371 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
372 for (CeedInt i = 0; i < P_1D; i++) {
373 V[k] += B[i + k * P_1D] * U[i]; // Contract z direction
374 }
375 }
376 }
377 }
378
379 //------------------------------------------------------------------------------
380 // 3D transpose tensor contract z
381 //------------------------------------------------------------------------------
382 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeZ3d(SharedData_Cuda & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)383 inline __device__ void ContractTransposeZ3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
384 for (CeedInt k = 0; k < P_1D; k++) {
385 V[k] = 0.0;
386 if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
387 for (CeedInt i = 0; i < Q_1D; i++) {
388 V[k] += B[k + i * P_1D] * U[i]; // Contract z direction
389 }
390 }
391 }
392 }
393
394 //------------------------------------------------------------------------------
395 // 3D transpose tensor contract y
396 //------------------------------------------------------------------------------
397 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeY3d(SharedData_Cuda & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)398 inline __device__ void ContractTransposeY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
399 CeedScalar r_B[Q_1D];
400 for (CeedInt i = 0; i < Q_1D; i++) {
401 r_B[i] = B[data.t_id_y + i * P_1D];
402 }
403
404 for (CeedInt k = 0; k < P_1D; k++) {
405 __syncthreads();
406 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
407 __syncthreads();
408 V[k] = 0.0;
409 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
410 for (CeedInt i = 0; i < Q_1D; i++) {
411 V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction
412 }
413 }
414 }
415 }
416
417 //------------------------------------------------------------------------------
418 // 3D transpose tensor contract y
419 //------------------------------------------------------------------------------
420 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeAddY3d(SharedData_Cuda & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)421 inline __device__ void ContractTransposeAddY3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
422 CeedScalar r_B[Q_1D];
423 for (CeedInt i = 0; i < Q_1D; i++) {
424 r_B[i] = B[data.t_id_y + i * P_1D];
425 }
426
427 for (CeedInt k = 0; k < P_1D; k++) {
428 __syncthreads();
429 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
430 __syncthreads();
431 if (data.t_id_x < Q_1D && data.t_id_y < P_1D) {
432 for (CeedInt i = 0; i < Q_1D; i++) {
433 V[k] += r_B[i] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction
434 }
435 }
436 }
437 }
438
439 //------------------------------------------------------------------------------
440 // 3D transpose tensor contract x
441 //------------------------------------------------------------------------------
442 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeX3d(SharedData_Cuda & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)443 inline __device__ void ContractTransposeX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
444 CeedScalar r_B[Q_1D];
445 for (CeedInt i = 0; i < Q_1D; i++) {
446 r_B[i] = B[data.t_id_x + i * P_1D];
447 }
448
449 for (CeedInt k = 0; k < P_1D; k++) {
450 __syncthreads();
451 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
452 __syncthreads();
453 V[k] = 0.0;
454 if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
455 for (CeedInt i = 0; i < Q_1D; i++) {
456 V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction
457 }
458 }
459 }
460 }
461
462 //------------------------------------------------------------------------------
463 // 3D transpose tensor contract add x
464 //------------------------------------------------------------------------------
465 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
ContractTransposeAddX3d(SharedData_Cuda & data,const CeedScalar * U,const CeedScalar * B,CeedScalar * V)466 inline __device__ void ContractTransposeAddX3d(SharedData_Cuda &data, const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
467 CeedScalar r_B[Q_1D];
468 for (CeedInt i = 0; i < Q_1D; i++) {
469 r_B[i] = B[data.t_id_x + i * P_1D];
470 }
471
472 for (CeedInt k = 0; k < P_1D; k++) {
473 __syncthreads();
474 data.slice[data.t_id_x + data.t_id_y * T_1D] = U[k];
475 __syncthreads();
476 if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
477 for (CeedInt i = 0; i < Q_1D; i++) {
478 V[k] += r_B[i] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction
479 }
480 }
481 }
482 }
483
484 //------------------------------------------------------------------------------
485 // 3D interpolate to quadrature points
486 //------------------------------------------------------------------------------
487 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTensor3d(SharedData_Cuda & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)488 inline __device__ void InterpTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
489 CeedScalar *__restrict__ r_V) {
490 CeedScalar r_t1[T_1D];
491 CeedScalar r_t2[T_1D];
492 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
493 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
494 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
495 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D]);
496 }
497 }
498
499 //------------------------------------------------------------------------------
500 // 3D interpolate transpose
501 //------------------------------------------------------------------------------
502 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTransposeTensor3d(SharedData_Cuda & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)503 inline __device__ void InterpTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
504 CeedScalar *__restrict__ r_V) {
505 CeedScalar r_t1[T_1D];
506 CeedScalar r_t2[T_1D];
507 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
508 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D], c_B, r_t1);
509 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
510 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
511 }
512 }
513
514 //------------------------------------------------------------------------------
515 // 3D interpolate to quadrature points, nodes and quadrature points collocated
516 //------------------------------------------------------------------------------
517 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTensorCollocatedNodes3d(SharedData_Cuda & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)518 inline __device__ void InterpTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
519 CeedScalar *__restrict__ r_V) {
520 for (CeedInt i = 0; i < Q_1D; i++) {
521 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
522 r_V[i + comp * Q_1D] = r_U[i + comp * P_1D];
523 }
524 }
525 }
526
527 //------------------------------------------------------------------------------
528 // 3D interpolate transpose, nodes and quadrature points collocated
529 //------------------------------------------------------------------------------
530 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
InterpTransposeTensorCollocatedNodes3d(SharedData_Cuda & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,CeedScalar * __restrict__ r_V)531 inline __device__ void InterpTransposeTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
532 CeedScalar *__restrict__ r_V) {
533 for (CeedInt i = 0; i < Q_1D; i++) {
534 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
535 r_V[i + comp * P_1D] = r_U[i + comp * Q_1D];
536 }
537 }
538 }
539
540 //------------------------------------------------------------------------------
541 // 3D derivatives at quadrature points
542 //------------------------------------------------------------------------------
543 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTensor3d(SharedData_Cuda & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)544 inline __device__ void GradTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
545 CeedScalar *__restrict__ r_V) {
546 CeedScalar r_t1[T_1D];
547 CeedScalar r_t2[T_1D];
548 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
549 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, r_t1);
550 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
551 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
552 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
553 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
554 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
555 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
556 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
557 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
558 }
559 }
560
561 //------------------------------------------------------------------------------
562 // 3D derivatives transpose
563 //------------------------------------------------------------------------------
564 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTransposeTensor3d(SharedData_Cuda & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)565 inline __device__ void GradTransposeTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
566 CeedScalar *__restrict__ r_V) {
567 CeedScalar r_t1[T_1D];
568 CeedScalar r_t2[T_1D];
569 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
570 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_B, r_t1);
571 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
572 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_G, &r_V[comp * P_1D]);
573 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_B, r_t1);
574 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_G, r_t2);
575 ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
576 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t1);
577 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
578 ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
579 }
580 }
581
582 //------------------------------------------------------------------------------
583 // 3D derivatives at quadrature points
584 //------------------------------------------------------------------------------
585 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTensorCollocated3d(SharedData_Cuda & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)586 inline __device__ void GradTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B, const CeedScalar *c_G,
587 CeedScalar *__restrict__ r_V) {
588 CeedScalar r_t1[T_1D];
589 CeedScalar r_t2[T_1D];
590 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
591 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_B, r_t1);
592 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
593 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
594 ContractX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
595 ContractY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
596 ContractZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, r_t1, c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
597 }
598 }
599
600 //------------------------------------------------------------------------------
601 // 3D derivatives transpose
602 //------------------------------------------------------------------------------
603 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTransposeTensorCollocated3d(SharedData_Cuda & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)604 inline __device__ void GradTransposeTensorCollocated3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
605 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
606 CeedScalar r_t1[T_1D];
607 CeedScalar r_t2[T_1D];
608 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
609 ContractTransposeZ3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, r_t2);
610 ContractTransposeAddY3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, r_t2);
611 ContractTransposeAddX3d<NUM_COMP, Q_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, r_t2);
612 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, r_t1);
613 ContractTransposeY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t1, c_B, r_t2);
614 ContractTransposeX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, r_t2, c_B, &r_V[comp * P_1D]);
615 }
616 }
617
618 //------------------------------------------------------------------------------
619 // 3D derivatives at quadrature points, nodes and quadrature points collocated
620 //------------------------------------------------------------------------------
621 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTensorCollocatedNodes3d(SharedData_Cuda & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)622 inline __device__ void GradTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
623 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
624 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
625 ContractX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 0 * NUM_COMP * Q_1D]);
626 ContractY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 1 * NUM_COMP * Q_1D]);
627 ContractZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * P_1D], c_G, &r_V[comp * Q_1D + 2 * NUM_COMP * Q_1D]);
628 }
629 }
630
631 //------------------------------------------------------------------------------
632 // 3D derivatives transpose, nodes and quadrature points collocated
633 //------------------------------------------------------------------------------
634 template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
GradTransposeTensorCollocatedNodes3d(SharedData_Cuda & data,const CeedScalar * __restrict__ r_U,const CeedScalar * c_B,const CeedScalar * c_G,CeedScalar * __restrict__ r_V)635 inline __device__ void GradTransposeTensorCollocatedNodes3d(SharedData_Cuda &data, const CeedScalar *__restrict__ r_U, const CeedScalar *c_B,
636 const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
637 for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
638 ContractTransposeZ3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 2 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
639 ContractTransposeAddY3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 1 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
640 ContractTransposeAddX3d<NUM_COMP, P_1D, Q_1D, T_1D>(data, &r_U[comp * Q_1D + 0 * NUM_COMP * Q_1D], c_G, &r_V[comp * P_1D]);
641 }
642 }
643
644 //------------------------------------------------------------------------------
645 // 3D quadrature weights
646 //------------------------------------------------------------------------------
647 template <int P_1D, int Q_1D>
WeightTensor3d(SharedData_Cuda & data,const CeedScalar * __restrict__ q_weight_1d,CeedScalar * w)648 inline __device__ void WeightTensor3d(SharedData_Cuda &data, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *w) {
649 const bool quad = (data.t_id_x < Q_1D && data.t_id_y < Q_1D);
650 const CeedScalar pw = quad ? q_weight_1d[data.t_id_x] * q_weight_1d[data.t_id_y] : 0.0;
651 for (CeedInt q = 0; q < Q_1D; q++) {
652 w[q] = quad ? pw * q_weight_1d[q] : 0.0;
653 }
654 }
655