xref: /libCEED/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h (revision f4112a4e6eeebf7fbbab43aa04d8ed6ce8965ebf)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for CUDA shared memory basis read/write templates
10 #include <ceed/types.h>
11 
12 //------------------------------------------------------------------------------
13 // 1D
14 //------------------------------------------------------------------------------
15 
16 //------------------------------------------------------------------------------
17 // E-vector -> single element
18 //------------------------------------------------------------------------------
19 template <int NUM_COMP, int P_1D>
20 inline __device__ void ReadElementStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
21                                             const CeedInt strides_elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
22   if (data.t_id_x < P_1D) {
23     const CeedInt node = data.t_id_x;
24     const CeedInt ind  = node * strides_node + elem * strides_elem;
25 
26     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
27       r_u[comp] = d_u[ind + comp * strides_comp];
28     }
29   }
30 }
31 
32 //------------------------------------------------------------------------------
33 // Single element -> E-vector
34 //------------------------------------------------------------------------------
35 template <int NUM_COMP, int P_1D>
36 inline __device__ void WriteElementStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
37                                              const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) {
38   if (data.t_id_x < P_1D) {
39     const CeedInt node = data.t_id_x;
40     const CeedInt ind  = node * strides_node + elem * strides_elem;
41 
42     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
43       d_v[ind + comp * strides_comp] = r_v[comp];
44     }
45   }
46 }
47 
48 template <int NUM_COMP, int P_1D>
49 inline __device__ void SumElementStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
50                                            const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) {
51   if (data.t_id_x < P_1D) {
52     const CeedInt node = data.t_id_x;
53     const CeedInt ind  = node * strides_node + elem * strides_elem;
54 
55     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
56       d_v[ind + comp * strides_comp] += r_v[comp];
57     }
58   }
59 }
60 
61 //------------------------------------------------------------------------------
62 // 2D
63 //------------------------------------------------------------------------------
64 
65 //------------------------------------------------------------------------------
66 // E-vector -> single element
67 //------------------------------------------------------------------------------
68 template <int NUM_COMP, int P_1D>
69 inline __device__ void ReadElementStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
70                                             const CeedInt strides_elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
71   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
72     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
73     const CeedInt ind  = node * strides_node + elem * strides_elem;
74 
75     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
76       r_u[comp] = d_u[ind + comp * strides_comp];
77     }
78   }
79 }
80 
81 //------------------------------------------------------------------------------
82 // Single element -> E-vector
83 //------------------------------------------------------------------------------
84 template <int NUM_COMP, int P_1D>
85 inline __device__ void WriteElementStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
86                                              const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) {
87   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
88     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
89     const CeedInt ind  = node * strides_node + elem * strides_elem;
90 
91     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
92       d_v[ind + comp * strides_comp] = r_v[comp];
93     }
94   }
95 }
96 
97 template <int NUM_COMP, int P_1D>
98 inline __device__ void SumElementStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
99                                            const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) {
100   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
101     const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
102     const CeedInt ind  = node * strides_node + elem * strides_elem;
103 
104     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
105       d_v[ind + comp * strides_comp] += r_v[comp];
106     }
107   }
108 }
109 
110 //------------------------------------------------------------------------------
111 // 3D
112 //------------------------------------------------------------------------------
113 
114 //------------------------------------------------------------------------------
115 // E-vector -> single element
116 //------------------------------------------------------------------------------
117 template <int NUM_COMP, int P_1D>
118 inline __device__ void ReadElementStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
119                                             const CeedInt strides_elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
120   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
121     for (CeedInt z = 0; z < P_1D; z++) {
122       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
123       const CeedInt ind  = node * strides_node + elem * strides_elem;
124 
125       for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
126         r_u[z + comp * P_1D] = d_u[ind + comp * strides_comp];
127       }
128     }
129   }
130 }
131 
132 //------------------------------------------------------------------------------
133 // Single element -> E-vector
134 //------------------------------------------------------------------------------
135 template <int NUM_COMP, int P_1D>
136 inline __device__ void WriteElementStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
137                                              const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) {
138   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
139     for (CeedInt z = 0; z < P_1D; z++) {
140       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
141       const CeedInt ind  = node * strides_node + elem * strides_elem;
142 
143       for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
144         d_v[ind + comp * strides_comp] = r_v[z + comp * P_1D];
145       }
146     }
147   }
148 }
149 
150 template <int NUM_COMP, int P_1D>
151 inline __device__ void SumElementStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt strides_node, const CeedInt strides_comp,
152                                            const CeedInt strides_elem, const CeedScalar *r_v, CeedScalar *d_v) {
153   if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
154     for (CeedInt z = 0; z < P_1D; z++) {
155       const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
156       const CeedInt ind  = node * strides_node + elem * strides_elem;
157 
158       for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
159         d_v[ind + comp * strides_comp] += r_v[z + comp * P_1D];
160       }
161     }
162   }
163 }
164 
165 //------------------------------------------------------------------------------
166 // AtPoints
167 //------------------------------------------------------------------------------
168 
169 //------------------------------------------------------------------------------
170 // E-vector -> single point
171 //------------------------------------------------------------------------------
172 template <int NUM_COMP, int NUM_PTS>
173 inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
174                                  const CeedInt strides_point, const CeedInt strides_comp, const CeedInt strides_elem,
175                                  const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
176   const CeedInt ind = (p % NUM_PTS) * strides_point + elem * strides_elem;
177 
178   if (p < points_in_elem) {
179     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
180       r_u[comp] = d_u[ind + comp * strides_comp];
181     }
182   } else {
183     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
184       r_u[comp] = 0.0;
185     }
186   }
187 }
188 
189 //------------------------------------------------------------------------------
190 // Single point -> E-vector
191 //------------------------------------------------------------------------------
192 template <int NUM_COMP, int NUM_PTS>
193 inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
194                                   const CeedInt strides_point, const CeedInt strides_comp, const CeedInt strides_elem, const CeedScalar *r_v,
195                                   CeedScalar *d_v) {
196   if (p < points_in_elem) {
197     const CeedInt ind = (p % NUM_PTS) * strides_point + elem * strides_elem;
198 
199     for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
200       d_v[ind + comp * strides_comp] = r_v[comp];
201     }
202   }
203 }
204