xref: /libCEED/include/ceed/jit-source/hip/hip-shared-basis-tensor.h (revision 77bb289ab1e4b7f6618bd6e1f5afbd06578dc0f7)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 /// @file
9 /// Internal header for HIP shared memory tensor product basis
10 #ifndef CEED_HIP_SHARED_BASIS_TENSOR_H
11 #define CEED_HIP_SHARED_BASIS_TENSOR_H
12 
13 #include <ceed.h>
14 
15 #include "hip-shared-basis-read-write-templates.h"
16 #include "hip-shared-basis-tensor-templates.h"
17 
18 //------------------------------------------------------------------------------
19 // Interp kernel by dim
20 //------------------------------------------------------------------------------
// Tensor-product interpolation, nodes -> quadrature points, specialized at JIT
// time by BASIS_DIM (1/2/3).
//
// Launch assumptions (from the indexing below): blockDim.x/y span the 1D tensor
// directions, blockDim.z packs several elements per thread block, and the
// grid-stride loop over blockIdx.x covers all num_elem elements.  The dynamic
// shared allocation `slice` must hold one scratch slab of
// T_1D * (BASIS_DIM > 1 ? T_1D : 1) scalars per threadIdx.z slab.
21 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
22     void Interp(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
23   extern __shared__ CeedScalar slice[];
24 
25   // load interp_1d into shared memory
26   __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
27   loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_B);
  // Barrier: every thread must see the fully loaded s_B before contracting with it
28   __syncthreads();
29 
  // Per-thread view of the block layout, shared with the contraction templates
30   SharedData_Hip data;
31   data.t_id_x = threadIdx.x;
32   data.t_id_y = threadIdx.y;
33   data.t_id_z = threadIdx.z;
34   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  // Each threadIdx.z slab gets its own T_1D (1D) or T_1D x T_1D (2D/3D) region
  // of the dynamic shared scratch
35   data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
36 
  // Register storage: one value per component, plus one per 1D layer in 3D
37   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
38   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
39 
  // Grid-stride loop over elements; blockDim.z elements are processed per block
  // per iteration.  BASIS_DIM is a JIT-time constant, so only one branch survives
  // compilation.  Stride arguments to the Read/Write templates are: node stride 1,
  // component span (P or Q)^dim * num_elem, element span (P or Q)^dim — see
  // hip-shared-basis-read-write-templates.h for the template contract.
40   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
41     if (BASIS_DIM == 1) {
42       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
43       Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
44       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
45     } else if (BASIS_DIM == 2) {
46       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
47       InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
48       WriteElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
49     } else if (BASIS_DIM == 3) {
50       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
51                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
52       InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
53       WriteElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
54                                                         BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
55     }
56   }
57 }
58 
// Transpose tensor-product interpolation, quadrature points -> nodes.  Mirrors
// Interp above with the P/Q roles of input and output swapped: r_U is sized by
// BASIS_Q_1D (quadrature data in) and r_V by BASIS_P_1D (nodal data out).
// Same launch assumptions as Interp: blockDim.z elements per block, grid-stride
// over elements, and a per-z-slab dynamic shared scratch region of
// T_1D * (BASIS_DIM > 1 ? T_1D : 1) scalars.
59 extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
60     void InterpTranspose(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
61   extern __shared__ CeedScalar slice[];
62 
63   // load interp_1d into shared memory
64   __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
65   loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_B);
  // Barrier: s_B must be fully populated before any thread uses it
66   __syncthreads();
67 
  // Per-thread view of the block layout, shared with the contraction templates
68   SharedData_Hip data;
69   data.t_id_x = threadIdx.x;
70   data.t_id_y = threadIdx.y;
71   data.t_id_z = threadIdx.z;
72   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  // Offset this thread's z-slab into the dynamic shared scratch
73   data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
74 
  // Registers: input at quadrature points (Q), output at nodes (P)
75   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
76   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
77 
  // Grid-stride loop over elements; only the BASIS_DIM branch selected at JIT
  // time survives compilation
78   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
79     if (BASIS_DIM == 1) {
80       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
81       InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
82       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
83     } else if (BASIS_DIM == 2) {
84       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
85       InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
86       WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
87     } else if (BASIS_DIM == 3) {
88       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
89                                                        BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
90       InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
91       WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
92                                                         BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
93     }
94   }
95 }
96 
97 //------------------------------------------------------------------------------
98 // Grad kernel by dim
99 //------------------------------------------------------------------------------
// Tensor-product gradient, nodes -> gradients at quadrature points.  The output
// carries BASIS_DIM derivative components per field component (see r_V sizing
// and the BASIS_NUM_COMP * BASIS_DIM write templates below).
// Same launch assumptions as Interp: blockDim.z elements per block, grid-stride
// over elements, per-z-slab dynamic shared scratch of
// T_1D * (BASIS_DIM > 1 ? T_1D : 1) scalars.
100 extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
101     void Grad(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *d_grad_1d, const CeedScalar *__restrict__ d_U,
102               CeedScalar *__restrict__ d_V) {
103   extern __shared__ CeedScalar slice[];
104 
105   // load interp_1d and grad_1d into shared memory
106   __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
107   loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_B);
  // With a collocated gradient the 1D derivative operator acts at quadrature
  // points, so s_G is Q x Q; otherwise it maps nodes to quadrature points (Q x P)
108   __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
109   loadMatrix<BASIS_Q_1D *(BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)>(d_grad_1d, s_G);
  // Barrier: both s_B and s_G must be fully loaded before use
110   __syncthreads();
111 
  // Per-thread view of the block layout, shared with the contraction templates
112   SharedData_Hip data;
113   data.t_id_x = threadIdx.x;
114   data.t_id_y = threadIdx.y;
115   data.t_id_z = threadIdx.z;
116   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  // Offset this thread's z-slab into the dynamic shared scratch
117   data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
118 
  // Registers: nodal input; output holds BASIS_DIM derivative components
119   CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
120   CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
121 
  // Grid-stride loop over elements; BASIS_DIM and BASIS_HAS_COLLOCATED_GRAD are
  // JIT-time constants, so dead branches are compiled out
122   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
123     if (BASIS_DIM == 1) {
124       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
125       Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
126       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
127     } else if (BASIS_DIM == 2) {
128       ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
129       GradTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
  // Write BASIS_NUM_COMP * BASIS_DIM output fields: one gradient direction per dim
130       WriteElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V,
131                                                                     d_V);
132     } else if (BASIS_DIM == 3) {
133       ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
134                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
  // Collocated variant interpolates to quadrature points first, then applies the
  // Q x Q derivative operator (see hip-shared-basis-tensor-templates.h)
135       if (BASIS_HAS_COLLOCATED_GRAD) GradTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
136       else GradTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
137       WriteElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
138                                                                     BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
139     }
140   }
141 }
142 
// Transpose tensor-product gradient, gradients at quadrature points -> nodes.
// Mirrors Grad above with input/output roles swapped: r_U holds BASIS_DIM
// derivative components per field component at quadrature points (Q), r_V holds
// nodal values (P).  Same launch assumptions as Grad: blockDim.z elements per
// block, grid-stride over elements, per-z-slab dynamic shared scratch of
// T_1D * (BASIS_DIM > 1 ? T_1D : 1) scalars.
143 extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
144     void GradTranspose(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *d_grad_1d, const CeedScalar *__restrict__ d_U,
145                        CeedScalar *__restrict__ d_V) {
146   extern __shared__ CeedScalar slice[];
147 
148   // load interp_1d and grad_1d into shared memory
149   __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
150   loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_B);
  // s_G sizing matches Grad: Q x Q for a collocated gradient, Q x P otherwise
151   __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
152   loadMatrix<BASIS_Q_1D *(BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)>(d_grad_1d, s_G);
  // Barrier: both s_B and s_G must be fully loaded before use
153   __syncthreads();
154 
  // Per-thread view of the block layout, shared with the contraction templates
155   SharedData_Hip data;
156   data.t_id_x = threadIdx.x;
157   data.t_id_y = threadIdx.y;
158   data.t_id_z = threadIdx.z;
159   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  // Offset this thread's z-slab into the dynamic shared scratch
160   data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
161 
  // Registers: BASIS_DIM derivative components in, one nodal component out
162   CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
163   CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
164 
  // Grid-stride loop over elements; dead BASIS_DIM branches are compiled out
165   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
166     if (BASIS_DIM == 1) {
167       ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
168       GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
169       WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
170     } else if (BASIS_DIM == 2) {
  // Read BASIS_NUM_COMP * BASIS_DIM input fields: one gradient direction per dim
171       ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
172                                                                    r_U);
173       GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
174       WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
175     } else if (BASIS_DIM == 3) {
176       ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
177                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
  // Collocated variant applies the transpose Q x Q derivative before the
  // transpose interpolation (see hip-shared-basis-tensor-templates.h)
178       if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
179       else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
180       WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
181                                                         BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
182     }
183   }
184 }
185 
186 //------------------------------------------------------------------------------
187 // Weight kernels by dim
188 //------------------------------------------------------------------------------
// Quadrature-weight kernel: writes the tensor-product quadrature weights for
// each element into d_W (a single-component field — note the <1, BASIS_Q_1D>
// write templates).  No basis matrices are needed, so nothing is staged into
// static shared memory here; the dynamic `slice` scratch is still set up for
// the Weight*/WriteElementStrided* templates, with the same per-z-slab layout
// as the kernels above.
189 extern "C" __launch_bounds__(BASIS_WEIGHT_BLOCK_SIZE) __global__
190     void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *__restrict__ d_W) {
191   extern __shared__ CeedScalar slice[];
192 
  // Per-thread view of the block layout, shared with the templates
193   SharedData_Hip data;
194   data.t_id_x = threadIdx.x;
195   data.t_id_y = threadIdx.y;
196   data.t_id_z = threadIdx.z;
197   data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  // Offset this thread's z-slab into the dynamic shared scratch
198   data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
199 
  // Registers: one weight per thread, plus one per 1D layer in 3D
200   CeedScalar r_W[BASIS_DIM > 2 ? BASIS_Q_1D : 1];
201 
  // Grid-stride loop over elements; blockDim.z elements per block per iteration
202   for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
203     if (BASIS_DIM == 1) {
204       Weight1d<BASIS_Q_1D>(data, q_weight_1d, r_W);
205       WriteElementStrided1d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_W, d_W);
206     } else if (BASIS_DIM == 2) {
207       WeightTensor2d<BASIS_Q_1D>(data, q_weight_1d, r_W);
208       WriteElementStrided2d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_W, d_W);
209     } else if (BASIS_DIM == 3) {
210       WeightTensor3d<BASIS_Q_1D>(data, q_weight_1d, r_W);
211       WriteElementStrided3d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_W,
212                                            d_W);
213     }
214   }
215 }
216 
217 //------------------------------------------------------------------------------
218 
219 #endif  // CEED_HIP_SHARED_BASIS_TENSOR_H
220