xref: /libCEED/include/ceed/jit-source/hip/hip-gen-templates.h (revision 363aefef7a03bbe8d88d10545fcfcb59e8ef47d0)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for HIP backend macro and type definitions for JiT source
10 #ifndef CEED_HIP_GEN_TEMPLATES_H
11 #define CEED_HIP_GEN_TEMPLATES_H
12 
13 #include <ceed/types.h>
14 
15 //------------------------------------------------------------------------------
16 // Load matrices for basis actions
17 //------------------------------------------------------------------------------
template <int P, int Q>
inline __device__ void loadMatrix(SharedData_Hip &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
  // Cooperatively copy a P x Q basis matrix from global to shared memory,
  // each thread handling every (block size)-th entry starting at its flat id
  const CeedInt num_threads = blockDim.x * blockDim.y * blockDim.z;

  for (CeedInt k = data.t_id; k < P * Q; k += num_threads) B[k] = d_B[k];
}
22 
23 //------------------------------------------------------------------------------
24 // 1D
25 //------------------------------------------------------------------------------
26 
27 //------------------------------------------------------------------------------
28 // L-vector -> E-vector, offsets provided
29 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void readDofsOffset1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                        const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Gather one element's nodal values from the L-vector into registers,
  // using the restriction indices; one thread per node along x
  if (data.t_id_x >= P_1d) return;

  const CeedInt ind = indices[data.t_id_x + elem * P_1d];

  for (CeedInt c = 0; c < NUM_COMP; c++) r_u[c] = d_u[ind + COMP_STRIDE * c];
}
40 
41 //------------------------------------------------------------------------------
42 // L-vector -> E-vector, strided
43 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Gather one element's nodal values from a strided L-vector into registers;
  // one thread per node along x
  if (data.t_id_x >= P_1d) return;

  const CeedInt ind = elem * STRIDES_ELEM + data.t_id_x * STRIDES_NODE;

  for (CeedInt c = 0; c < NUM_COMP; c++) r_u[c] = d_u[ind + c * STRIDES_COMP];
}
53 
54 //------------------------------------------------------------------------------
55 // E-vector -> L-vector, offsets provided
56 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void writeDofsOffset1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                         const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Scatter-add one element's register values back into the L-vector through the
  // restriction indices; atomicAdd because multiple elements may share a node
  if (data.t_id_x >= P_1d) return;

  const CeedInt ind = indices[data.t_id_x + elem * P_1d];

  for (CeedInt c = 0; c < NUM_COMP; c++) atomicAdd(&d_v[ind + COMP_STRIDE * c], r_v[c]);
}
67 
68 //------------------------------------------------------------------------------
69 // E-vector -> L-vector, strided
70 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  // Accumulate one element's register values into a strided L-vector;
  // strided layout gives each element exclusive slots, so no atomics needed
  if (data.t_id_x >= P_1d) return;

  const CeedInt ind = elem * STRIDES_ELEM + data.t_id_x * STRIDES_NODE;

  for (CeedInt c = 0; c < NUM_COMP; c++) d_v[ind + c * STRIDES_COMP] += r_v[c];
}
81 
82 //------------------------------------------------------------------------------
83 // 2D
84 //------------------------------------------------------------------------------
85 
86 //------------------------------------------------------------------------------
87 // L-vector -> E-vector, offsets provided
88 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void readDofsOffset2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                        const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Gather one 2D element's nodal values from the L-vector into registers;
  // threads are laid out as a P_1d x P_1d tile in (x, y)
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  const CeedInt node = data.t_id_x + P_1d * data.t_id_y;
  const CeedInt ind  = indices[node + elem * P_1d * P_1d];

  for (CeedInt c = 0; c < NUM_COMP; c++) r_u[c] = d_u[ind + COMP_STRIDE * c];
}
99 
100 //------------------------------------------------------------------------------
101 // L-vector -> E-vector, strided
102 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Gather one 2D element's nodal values from a strided L-vector into registers;
  // threads are laid out as a P_1d x P_1d tile in (x, y)
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  const CeedInt node = data.t_id_x + P_1d * data.t_id_y;
  const CeedInt ind  = elem * STRIDES_ELEM + node * STRIDES_NODE;

  for (CeedInt c = 0; c < NUM_COMP; c++) r_u[c] = d_u[ind + c * STRIDES_COMP];
}
112 
113 //------------------------------------------------------------------------------
114 // E-vector -> L-vector, offsets provided
115 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void writeDofsOffset2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                         const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Scatter-add one 2D element's register values back into the L-vector through
  // the restriction indices; atomicAdd because elements may share nodes
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  const CeedInt node = data.t_id_x + P_1d * data.t_id_y;
  const CeedInt ind  = indices[node + elem * P_1d * P_1d];

  for (CeedInt c = 0; c < NUM_COMP; c++) atomicAdd(&d_v[ind + COMP_STRIDE * c], r_v[c]);
}
126 
127 //------------------------------------------------------------------------------
128 // E-vector -> L-vector, strided
129 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  // Accumulate one 2D element's register values into a strided L-vector;
  // strided layout gives each element exclusive slots, so no atomics needed
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  const CeedInt node = data.t_id_x + P_1d * data.t_id_y;
  const CeedInt ind  = elem * STRIDES_ELEM + node * STRIDES_NODE;

  for (CeedInt c = 0; c < NUM_COMP; c++) d_v[ind + c * STRIDES_COMP] += r_v[c];
}
140 
141 //------------------------------------------------------------------------------
142 // 3D
143 //------------------------------------------------------------------------------
144 
145 //------------------------------------------------------------------------------
146 // L-vector -> E-vector, offsets provided
147 //------------------------------------------------------------------------------
148 // TODO: remove "Dofs" and "Quads" in the following function names?
149 //   - readDofsOffset3d -> readOffset3d ?
150 //   - readDofsStrided3d -> readStrided3d ?
151 //   - readSliceQuadsOffset3d -> readSliceOffset3d ?
152 //   - readSliceQuadsStrided3d -> readSliceStrided3d ?
153 //   - writeDofsOffset3d -> writeOffset3d ?
154 //   - writeDofsStrided3d -> writeStrided3d ?
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void readDofsOffset3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                        const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Gather one 3D element's nodal values from the L-vector into registers;
  // threads cover a P_1d x P_1d (x, y) tile and each thread loops over z,
  // holding a z-column per component in registers
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  for (CeedInt z = 0; z < P_1d; z++) {
    const CeedInt node = data.t_id_x + P_1d * (data.t_id_y + P_1d * z);
    const CeedInt ind  = indices[node + elem * P_1d * P_1d * P_1d];

    for (CeedInt c = 0; c < NUM_COMP; c++) r_u[z + c * P_1d] = d_u[ind + COMP_STRIDE * c];
  }
}
166 
167 //------------------------------------------------------------------------------
168 // L-vector -> E-vector, strided
169 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Gather one 3D element's nodal values from a strided L-vector into registers;
  // threads cover a P_1d x P_1d (x, y) tile and each thread loops over z
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  for (CeedInt z = 0; z < P_1d; z++) {
    const CeedInt node = data.t_id_x + P_1d * (data.t_id_y + P_1d * z);
    const CeedInt ind  = elem * STRIDES_ELEM + node * STRIDES_NODE;

    for (CeedInt c = 0; c < NUM_COMP; c++) r_u[z + c * P_1d] = d_u[ind + c * STRIDES_COMP];
  }
}
180 
181 //------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
183 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int Q_1d>
inline __device__ void readSliceQuadsOffset3d(SharedData_Hip &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
                                              const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Gather the values of a single z-slice (quadrature level q) of one 3D element
  // into registers, using the restriction indices
  if (data.t_id_x >= Q_1d || data.t_id_y >= Q_1d) return;

  const CeedInt node = data.t_id_x + Q_1d * (data.t_id_y + Q_1d * q);
  const CeedInt ind  = indices[node + elem * Q_1d * Q_1d * Q_1d];

  for (CeedInt c = 0; c < NUM_COMP; c++) r_u[c] = d_u[ind + COMP_STRIDE * c];
}
194 
195 //------------------------------------------------------------------------------
196 // E-vector -> Q-vector, strided
197 //------------------------------------------------------------------------------
template <int NUM_COMP, int Q_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readSliceQuadsStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
                                               CeedScalar *__restrict__ r_u) {
  // Gather the values of a single z-slice (quadrature level q) of one 3D element
  // from a strided E-vector into registers
  if (data.t_id_x >= Q_1d || data.t_id_y >= Q_1d) return;

  const CeedInt node = data.t_id_x + Q_1d * (data.t_id_y + Q_1d * q);
  const CeedInt ind  = elem * STRIDES_ELEM + node * STRIDES_NODE;

  for (CeedInt c = 0; c < NUM_COMP; c++) r_u[c] = d_u[ind + c * STRIDES_COMP];
}
208 
209 //------------------------------------------------------------------------------
210 // E-vector -> L-vector, offsets provided
211 //------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void writeDofsOffset3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                         const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Scatter-add one 3D element's register values back into the L-vector through
  // the restriction indices; atomicAdd because elements may share nodes
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  for (CeedInt z = 0; z < P_1d; z++) {
    const CeedInt node = data.t_id_x + P_1d * (data.t_id_y + P_1d * z);
    const CeedInt ind  = indices[node + elem * P_1d * P_1d * P_1d];

    for (CeedInt c = 0; c < NUM_COMP; c++) atomicAdd(&d_v[ind + COMP_STRIDE * c], r_v[z + c * P_1d]);
  }
}
223 
224 //------------------------------------------------------------------------------
225 // E-vector -> L-vector, strided
226 //------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  // Accumulate one 3D element's register values into a strided L-vector;
  // strided layout gives each element exclusive slots, so no atomics needed
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  for (CeedInt z = 0; z < P_1d; z++) {
    const CeedInt node = data.t_id_x + P_1d * (data.t_id_y + P_1d * z);
    const CeedInt ind  = elem * STRIDES_ELEM + node * STRIDES_NODE;

    for (CeedInt c = 0; c < NUM_COMP; c++) d_v[ind + c * STRIDES_COMP] += r_v[z + c * P_1d];
  }
}
238 
239 //------------------------------------------------------------------------------
240 // 3D collocated derivatives computation
241 //------------------------------------------------------------------------------
template <int NUM_COMP, int Q_1d>
inline __device__ void gradCollo3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                   CeedScalar *__restrict__ r_V) {
  // Compute collocated X/Y/Z derivatives at quadrature level q.
  // Threads cover a Q_1d x Q_1d (x, y) tile; data.slice is a T_1D x T_1D shared
  // scratch buffer used to exchange the current z-slice between threads.
  //
  // Fix: the original placed __syncthreads() inside the divergent
  // `if (t_id_x < Q_1d && t_id_y < Q_1d)` guard. When the thread block is
  // T_1D x T_1D with T_1D > Q_1d, inactive threads never reach the barrier,
  // which is undefined behavior for a block-level barrier. Barriers are now
  // executed unconditionally by all threads; only the work is guarded.
  const bool is_active = data.t_id_x < Q_1d && data.t_id_y < Q_1d;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Publish this thread's value of the current z-slice for component comp
    if (is_active) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1d];
    __syncthreads();
    if (is_active) {
      // X derivative
      r_V[comp + 0 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1d; i++)
        r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1d] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction (X derivative)
      // Y derivative
      r_V[comp + 1 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1d; i++)
        r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1d] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction (Y derivative)
      // Z derivative: contracts the register-held z-column, no shared memory needed
      r_V[comp + 2 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1d; i++) r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1d] * r_U[i + comp * Q_1d];  // Contract z direction (Z derivative)
    }
    // Ensure all reads of data.slice finish before the next component overwrites it
    __syncthreads();
  }
}
264 
265 //------------------------------------------------------------------------------
266 // 3D collocated derivatives transpose
267 //------------------------------------------------------------------------------
template <int NUM_COMP, int Q_1d>
inline __device__ void gradColloTranspose3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                            CeedScalar *__restrict__ r_V) {
  // Apply the transpose of the collocated gradient at quadrature level q,
  // accumulating into r_V. Threads cover a Q_1d x Q_1d (x, y) tile; data.slice
  // is a T_1D x T_1D shared scratch buffer.
  //
  // Fix: the original placed __syncthreads() inside the divergent
  // `if (t_id_x < Q_1d && t_id_y < Q_1d)` guard. When the thread block is
  // T_1D x T_1D with T_1D > Q_1d, inactive threads never reach the barrier,
  // which is undefined behavior for a block-level barrier. Barriers are now
  // executed unconditionally by all threads; only the work is guarded.
  const bool is_active = data.t_id_x < Q_1d && data.t_id_y < Q_1d;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // X derivative
    if (is_active) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
    __syncthreads();
    if (is_active) {
      for (CeedInt i = 0; i < Q_1d; i++)
        r_V[q + comp * Q_1d] += c_G[data.t_id_x + i * Q_1d] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction (X derivative)
    }
    __syncthreads();
    // Y derivative
    if (is_active) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
    __syncthreads();
    if (is_active) {
      for (CeedInt i = 0; i < Q_1d; i++)
        r_V[q + comp * Q_1d] += c_G[data.t_id_y + i * Q_1d] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction (Y derivative)
      // Z derivative: register-only, no shared memory involved
      for (CeedInt i = 0; i < Q_1d; i++)
        r_V[i + comp * Q_1d] += c_G[i + q * Q_1d] * r_U[comp + 2 * NUM_COMP];  // PARTIAL contract z direction (Z derivative)
    }
    // Ensure all reads of data.slice finish before the next component overwrites it
    __syncthreads();
  }
}
291 
292 #endif  // CEED_HIP_GEN_TEMPLATES_H
293