xref: /libCEED/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h (revision 290fc47b5baa6f633b0072e77241e8667e9a8675)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for CUDA tensor product basis with AtPoints evaluation
10 #include <ceed/types.h>
11 
12 #include "cuda-shared-basis-read-write-templates.h"
13 #include "cuda-shared-basis-tensor-at-points-templates.h"
14 #include "cuda-shared-basis-tensor-templates.h"
15 
16 //------------------------------------------------------------------------------
17 // Tensor Basis Kernels AtPoints
18 //------------------------------------------------------------------------------
19 
20 //------------------------------------------------------------------------------
21 // Interp
22 //------------------------------------------------------------------------------
23 extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem,
24                                           const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
25   extern __shared__ CeedScalar slice[];
26 
27   SharedData_Cuda data;
28   data.t_id_x = threadIdx.x;
29   data.t_id_y = threadIdx.y;
30   data.t_id_z = threadIdx.z;
31   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
32   data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
33 
34   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
35   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
36   CeedScalar r_V[BASIS_NUM_COMP];
37 
38   // Apply basis element by element
39   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
40     // Map to coefficients
41     if (BASIS_DIM == 1) {
42       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
43       Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_C);
44     } else if (BASIS_DIM == 2) {
45       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
46       InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_C);
47     } else if (BASIS_DIM == 3) {
48       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
49                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
50       InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_C);
51     }
52 
53     // Map to points
54     InterpAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, r_C, &d_X[elem * BASIS_NUM_PTS], r_V,
55                                                                          &d_V[elem * BASIS_NUM_PTS]);
56   }
57 }
58 
59 extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B,
60                                                    const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X,
61                                                    const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
62   extern __shared__ CeedScalar slice[];
63 
64   SharedData_Cuda data;
65   data.t_id_x = threadIdx.x;
66   data.t_id_y = threadIdx.y;
67   data.t_id_z = threadIdx.z;
68   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
69   data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
70 
71   CeedScalar r_U[BASIS_NUM_COMP];
72   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
73   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
74 
75   // Apply basis element by element
76   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
77     // Map from points
78     InterpTransposeAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, points_per_elem[elem],
79                                                                                   &d_U[elem * BASIS_NUM_PTS], r_U, &d_X[elem * BASIS_NUM_PTS], r_C);
80 
81     // Map from coefficients
82     if (BASIS_DIM == 1) {
83       InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, c_B, r_V);
84       SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
85     } else if (BASIS_DIM == 2) {
86       InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, c_B, r_V);
87       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
88     } else if (BASIS_DIM == 3) {
89       InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, c_B, r_V);
90       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
91                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
92     }
93   }
94 }
95 
96 //------------------------------------------------------------------------------
97 // Grad
98 //------------------------------------------------------------------------------
99 extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem,
100                                         const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
101   extern __shared__ CeedScalar slice[];
102 
103   SharedData_Cuda data;
104   data.t_id_x = threadIdx.x;
105   data.t_id_y = threadIdx.y;
106   data.t_id_z = threadIdx.z;
107   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
108   data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
109 
110   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
111   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
112   CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];
113 
114   // Apply basis element by element
115   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
116     // Map to coefficients
117     if (BASIS_DIM == 1) {
118       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
119       Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_C);
120     } else if (BASIS_DIM == 2) {
121       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
122       InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_C);
123     } else if (BASIS_DIM == 3) {
124       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
125                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
126       InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, c_B, r_C);
127     }
128 
129     // Map to points
130     GradAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, r_C, &d_X[elem * BASIS_NUM_PTS], r_V,
131                                                                        &d_V[elem * BASIS_NUM_PTS]);
132   }
133 }
134 
135 extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B,
136                                                  const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X,
137                                                  const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
138   extern __shared__ CeedScalar slice[];
139 
140   SharedData_Cuda data;
141   data.t_id_x = threadIdx.x;
142   data.t_id_y = threadIdx.y;
143   data.t_id_z = threadIdx.z;
144   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
145   data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
146 
147   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
148   CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
149   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
150 
151   // Apply basis element by element
152   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
153     // Map from points
154     GradTransposeAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, points_per_elem[elem],
155                                                                                 &d_U[elem * BASIS_NUM_PTS], r_U, &d_X[elem * BASIS_NUM_PTS], r_C);
156 
157     // Map from coefficients
158     if (BASIS_DIM == 1) {
159       InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, c_B, r_V);
160       SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
161     } else if (BASIS_DIM == 2) {
162       InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, c_B, r_V);
163       SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
164     } else if (BASIS_DIM == 3) {
165       InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, c_B, r_V);
166       SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
167                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
168     }
169   }
170 }
171