xref: /libCEED/include/ceed/jit-source/hip/hip-gen-templates.h (revision f1c40530bc6f02bc6ca847af718b8b41cc3fd3ca)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for HIP backend macro and type definitions for JiT source
10 #ifndef _ceed_hip_gen_templates_h
11 #define _ceed_hip_gen_templates_h
12 
13 #include <ceed/types.h>
14 
15 
16 //------------------------------------------------------------------------------
17 // Load matrices for basis actions
18 //------------------------------------------------------------------------------
19 template <int P, int Q>
20 inline __device__ void loadMatrix(SharedData_Hip& data, const CeedScalar* d_B, CeedScalar* B) {
21   for (CeedInt i = data.t_id; i < P*Q; i += blockDim.x*blockDim.y*blockDim.z)
22     B[i] = d_B[i];
23 }
24 
25 //------------------------------------------------------------------------------
26 // 1D
27 //------------------------------------------------------------------------------
28 
29 //------------------------------------------------------------------------------
30 // L-vector -> E-vector, offsets provided
31 //------------------------------------------------------------------------------
32 template <int NCOMP, int COMPSTRIDE, int P1d>
33 inline __device__ void readDofsOffset1d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
34   if (data.t_id_x < P1d) {
35     const CeedInt node = data.t_id_x;
36     const CeedInt ind = indices[node + elem * P1d];
37     for (CeedInt comp = 0; comp < NCOMP; ++comp)
38       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
39   }
40 }
41 
42 //------------------------------------------------------------------------------
43 // L-vector -> E-vector, strided
44 //------------------------------------------------------------------------------
45 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
46 inline __device__ void readDofsStrided1d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
47   if (data.t_id_x < P1d) {
48     const CeedInt node = data.t_id_x;
49     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
50     for (CeedInt comp = 0; comp < NCOMP; ++comp)
51       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
52   }
53 }
54 
55 //------------------------------------------------------------------------------
56 // E-vector -> L-vector, offsets provided
57 //------------------------------------------------------------------------------
58 template <int NCOMP, int COMPSTRIDE, int P1d>
59 inline __device__ void writeDofsOffset1d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
60   if (data.t_id_x < P1d) {
61     const CeedInt node = data.t_id_x;
62     const CeedInt ind = indices[node + elem * P1d];
63     for (CeedInt comp = 0; comp < NCOMP; ++comp)
64       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
65   }
66 }
67 
68 //------------------------------------------------------------------------------
69 // E-vector -> L-vector, strided
70 //------------------------------------------------------------------------------
71 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
72 inline __device__ void writeDofsStrided1d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
73   if (data.t_id_x < P1d) {
74     const CeedInt node = data.t_id_x;
75     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
76     for (CeedInt comp = 0; comp < NCOMP; ++comp)
77       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
78   }
79 }
80 
81 //------------------------------------------------------------------------------
82 // 2D
83 //------------------------------------------------------------------------------
84 
85 //------------------------------------------------------------------------------
86 // L-vector -> E-vector, offsets provided
87 //------------------------------------------------------------------------------
88 template <int NCOMP, int COMPSTRIDE, int P1d>
89 inline __device__ void readDofsOffset2d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
90   if (data.t_id_x < P1d && data.t_id_y < P1d) {
91     const CeedInt node = data.t_id_x + data.t_id_y*P1d;
92     const CeedInt ind = indices[node + elem * P1d*P1d];
93     for (CeedInt comp = 0; comp < NCOMP; ++comp)
94       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
95   }
96 }
97 
98 //------------------------------------------------------------------------------
99 // L-vector -> E-vector, strided
100 //------------------------------------------------------------------------------
101 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
102 inline __device__ void readDofsStrided2d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
103   if (data.t_id_x < P1d && data.t_id_y < P1d) {
104     const CeedInt node = data.t_id_x + data.t_id_y*P1d;
105     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
106     for (CeedInt comp = 0; comp < NCOMP; ++comp)
107       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
108   }
109 }
110 
111 //------------------------------------------------------------------------------
112 // E-vector -> L-vector, offsets provided
113 //------------------------------------------------------------------------------
114 template <int NCOMP, int COMPSTRIDE, int P1d>
115 inline __device__ void writeDofsOffset2d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
116   if (data.t_id_x < P1d && data.t_id_y < P1d) {
117     const CeedInt node = data.t_id_x + data.t_id_y*P1d;
118     const CeedInt ind = indices[node + elem * P1d*P1d];
119     for (CeedInt comp = 0; comp < NCOMP; ++comp)
120       atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
121   }
122 }
123 
124 //------------------------------------------------------------------------------
125 // E-vector -> L-vector, strided
126 //------------------------------------------------------------------------------
127 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
128 inline __device__ void writeDofsStrided2d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
129   if (data.t_id_x < P1d && data.t_id_y < P1d) {
130     const CeedInt node = data.t_id_x + data.t_id_y*P1d;
131     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
132     for (CeedInt comp = 0; comp < NCOMP; ++comp)
133       d_v[ind + comp * STRIDES_COMP] += r_v[comp];
134   }
135 }
136 
137 //------------------------------------------------------------------------------
138 // 3D
139 //------------------------------------------------------------------------------
140 
141 //------------------------------------------------------------------------------
142 // L-vector -> E-vector, offsets provided
143 //------------------------------------------------------------------------------
144 template <int NCOMP, int COMPSTRIDE, int P1d>
145 inline __device__ void readDofsOffset3d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
146   if (data.t_id_x < P1d && data.t_id_y < P1d)
147     for (CeedInt z = 0; z < P1d; ++z) {
148       const CeedInt node = data.t_id_x + data.t_id_y*P1d + z*P1d*P1d;
149       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
150       for (CeedInt comp = 0; comp < NCOMP; ++comp)
151         r_u[z+comp*P1d] = d_u[ind + COMPSTRIDE * comp];
152     }
153 }
154 
155 //------------------------------------------------------------------------------
156 // L-vector -> E-vector, strided
157 //------------------------------------------------------------------------------
158 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
159 inline __device__ void readDofsStrided3d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
160   if (data.t_id_x < P1d && data.t_id_y < P1d)
161     for (CeedInt z = 0; z < P1d; ++z) {
162       const CeedInt node = data.t_id_x + data.t_id_y*P1d + z*P1d*P1d;
163       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
164       for (CeedInt comp = 0; comp < NCOMP; ++comp)
165         r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
166     }
167 }
168 
169 //------------------------------------------------------------------------------
170 // E-vector -> Q-vector, offests provided
171 //------------------------------------------------------------------------------
172 template <int NCOMP, int COMPSTRIDE, int Q1d>
173 inline __device__ void readSliceQuadsOffset3d(SharedData_Hip& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
174   if (data.t_id_x < Q1d && data.t_id_y < Q1d) {
175     const CeedInt node = data.t_id_x + data.t_id_y*Q1d + q*Q1d*Q1d;
176     const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d];;
177     for (CeedInt comp = 0; comp < NCOMP; ++comp)
178       r_u[comp] = d_u[ind + COMPSTRIDE * comp];
179   }
180 }
181 
182 //------------------------------------------------------------------------------
183 // E-vector -> Q-vector, strided
184 //------------------------------------------------------------------------------
185 template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
186 inline __device__ void readSliceQuadsStrided3d(SharedData_Hip& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
187   if (data.t_id_x < Q1d && data.t_id_y < Q1d) {
188     const CeedInt node = data.t_id_x + data.t_id_y*Q1d + q*Q1d*Q1d;
189     const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
190     for (CeedInt comp = 0; comp < NCOMP; ++comp)
191       r_u[comp] = d_u[ind + comp * STRIDES_COMP];
192   }
193 }
194 
195 //------------------------------------------------------------------------------
196 // E-vector -> L-vector, offsets provided
197 //------------------------------------------------------------------------------
198 template <int NCOMP, int COMPSTRIDE, int P1d>
199 inline __device__ void writeDofsOffset3d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
200   if (data.t_id_x < P1d && data.t_id_y < P1d)
201     for (CeedInt z = 0; z < P1d; ++z) {
202       const CeedInt node = data.t_id_x + data.t_id_y*P1d + z*P1d*P1d;
203       const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
204       for (CeedInt comp = 0; comp < NCOMP; ++comp)
205         atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
206     }
207 }
208 
209 //------------------------------------------------------------------------------
210 // E-vector -> L-vector, strided
211 //------------------------------------------------------------------------------
212 template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
213 inline __device__ void writeDofsStrided3d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
214   if (data.t_id_x < P1d && data.t_id_y < P1d)
215     for (CeedInt z = 0; z < P1d; ++z) {
216       const CeedInt node = data.t_id_x + data.t_id_y*P1d + z*P1d*P1d;
217       const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
218       for (CeedInt comp = 0; comp < NCOMP; ++comp)
219         d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
220     }
221 }
222 
223 //------------------------------------------------------------------------------
224 // 3D collocated derivatives computation
225 //------------------------------------------------------------------------------
226 template <int NCOMP, int Q1d>
227 inline __device__ void gradCollo3d(SharedData_Hip& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
228   if (data.t_id_x < Q1d && data.t_id_y < Q1d) {
229     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
230       data.slice[data.t_id_x + data.t_id_y*T_1D] = r_U[q + comp*Q1d];
231       __syncthreads();
232       // X derivative
233       r_V[comp+0*NCOMP] = 0.0;
234       for (CeedInt i = 0; i < Q1d; ++i)
235         r_V[comp+0*NCOMP] += c_G[i + data.t_id_x*Q1d] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction (X derivative)
236       // Y derivative
237       r_V[comp+1*NCOMP] = 0.0;
238       for (CeedInt i = 0; i < Q1d; ++i)
239         r_V[comp+1*NCOMP] += c_G[i + data.t_id_y*Q1d] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction (Y derivative)
240       // Z derivative
241       r_V[comp+2*NCOMP] = 0.0;
242       for (CeedInt i = 0; i < Q1d; ++i)
243         r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
244       __syncthreads();
245     }
246   }
247 }
248 
249 //------------------------------------------------------------------------------
250 // 3D collocated derivatives transpose
251 //------------------------------------------------------------------------------
252 template <int NCOMP, int Q1d>
253 inline __device__ void gradColloTranspose3d(SharedData_Hip& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
254   if (data.t_id_x < Q1d && data.t_id_y < Q1d) {
255     for (CeedInt comp = 0; comp < NCOMP; ++comp) {
256       // X derivative
257       data.slice[data.t_id_x + data.t_id_y*T_1D] = r_U[comp + 0*NCOMP];
258       __syncthreads();
259       for (CeedInt i = 0; i < Q1d; ++i)
260         r_V[q+comp*Q1d] += c_G[data.t_id_x + i*Q1d] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction (X derivative)
261       __syncthreads();
262       // Y derivative
263       data.slice[data.t_id_x + data.t_id_y*T_1D] = r_U[comp + 1*NCOMP];
264       __syncthreads();
265       for (CeedInt i = 0; i < Q1d; ++i)
266         r_V[q+comp*Q1d] += c_G[data.t_id_y + i*Q1d] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction (Y derivative)
267       __syncthreads();
268       // Z derivative
269       for (CeedInt i = 0; i < Q1d; ++i)
270         r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
271     }
272   }
273 }
274 
275 #endif
276