1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 /// @file 9 /// Internal header for HIP backend macro and type definitions for JiT source 10 #include <ceed/types.h> 11 12 //------------------------------------------------------------------------------ 13 // Load matrices for basis actions 14 //------------------------------------------------------------------------------ 15 template <int P, int Q> 16 inline __device__ void LoadMatrix(SharedData_Hip &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) { 17 for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i]; 18 } 19 20 //------------------------------------------------------------------------------ 21 // 1D 22 //------------------------------------------------------------------------------ 23 24 //------------------------------------------------------------------------------ 25 // L-vector -> E-vector, offsets provided 26 //------------------------------------------------------------------------------ 27 template <int NUM_COMP, int COMP_STRIDE, int P_1d> 28 inline __device__ void ReadLVecStandard1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 29 const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { 30 if (data.t_id_x < P_1d) { 31 const CeedInt node = data.t_id_x; 32 const CeedInt ind = indices[node + elem * P_1d]; 33 34 for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp]; 35 } 36 } 37 38 //------------------------------------------------------------------------------ 39 // L-vector -> E-vector, strided 40 //------------------------------------------------------------------------------ 41 template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 42 inline __device__ void ReadLVecStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { 43 if (data.t_id_x < P_1d) { 44 const CeedInt node = data.t_id_x; 45 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 46 47 for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 48 } 49 } 50 51 //------------------------------------------------------------------------------ 52 // E-vector -> L-vector, offsets provided 53 //------------------------------------------------------------------------------ 54 template <int NUM_COMP, int COMP_STRIDE, int P_1d> 55 inline __device__ void WriteLVecStandard1d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 56 const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { 57 if (data.t_id_x < P_1d) { 58 const CeedInt node = data.t_id_x; 59 const CeedInt ind = indices[node + elem * P_1d]; 60 61 for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]); 62 } 63 } 64 65 //------------------------------------------------------------------------------ 66 // E-vector -> L-vector, strided 67 //------------------------------------------------------------------------------ 68 template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 69 inline __device__ void WriteLVecStrided1d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v, 70 CeedScalar *__restrict__ d_v) { 71 if (data.t_id_x < P_1d) { 72 const CeedInt node = data.t_id_x; 73 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 74 75 for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 76 } 77 } 78 79 //------------------------------------------------------------------------------ 80 // 2D 81 //------------------------------------------------------------------------------ 82 83 //------------------------------------------------------------------------------ 84 // L-vector -> E-vector, offsets provided 85 //------------------------------------------------------------------------------ 86 template <int NUM_COMP, int COMP_STRIDE, int P_1d> 87 inline __device__ void ReadLVecStandard2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 88 const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { 89 if (data.t_id_x < P_1d && data.t_id_y < P_1d) { 90 const CeedInt node = data.t_id_x + data.t_id_y * P_1d; 91 const CeedInt ind = indices[node + elem * P_1d * P_1d]; 92 93 for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp]; 94 } 95 } 96 97 //------------------------------------------------------------------------------ 98 // L-vector -> E-vector, strided 99 //------------------------------------------------------------------------------ 100 template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 101 inline __device__ void ReadLVecStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { 102 if (data.t_id_x < P_1d && data.t_id_y < P_1d) { 103 const CeedInt node = data.t_id_x + data.t_id_y * P_1d; 104 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 105 106 for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 107 } 108 } 109 110 //------------------------------------------------------------------------------ 111 // E-vector -> L-vector, offsets provided 112 //------------------------------------------------------------------------------ 113 template <int NUM_COMP, int COMP_STRIDE, int P_1d> 114 inline __device__ void WriteLVecStandard2d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 115 const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { 116 if (data.t_id_x < P_1d && data.t_id_y < P_1d) { 117 const CeedInt node = data.t_id_x + data.t_id_y * P_1d; 118 const CeedInt ind = indices[node + elem * P_1d * P_1d]; 119 120 for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]); 121 } 122 } 123 124 //------------------------------------------------------------------------------ 125 // E-vector -> L-vector, strided 126 //------------------------------------------------------------------------------ 127 template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 128 inline __device__ void WriteLVecStrided2d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v, 129 CeedScalar *__restrict__ d_v) { 130 if (data.t_id_x < P_1d && data.t_id_y < P_1d) { 131 const CeedInt node = data.t_id_x + data.t_id_y * P_1d; 132 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 133 134 for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[comp]; 135 } 136 } 137 138 //------------------------------------------------------------------------------ 139 // 3D 140 //------------------------------------------------------------------------------ 141 142 //------------------------------------------------------------------------------ 143 // L-vector -> E-vector, offsets provided 144 //------------------------------------------------------------------------------ 145 template <int NUM_COMP, int COMP_STRIDE, int P_1d> 146 inline __device__ void ReadLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 147 const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { 148 if (data.t_id_x < P_1d && data.t_id_y < P_1d) 149 for (CeedInt z = 0; z < P_1d; z++) { 150 const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d; 151 const CeedInt ind = indices[node + elem * P_1d * P_1d * P_1d]; 152 153 for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[ind + COMP_STRIDE * comp]; 154 } 155 } 156 157 //------------------------------------------------------------------------------ 158 // L-vector -> E-vector, strided 159 //------------------------------------------------------------------------------ 160 template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 161 inline __device__ void ReadLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) { 162 if (data.t_id_x < P_1d && data.t_id_y < P_1d) 163 for (CeedInt z = 0; z < P_1d; z++) { 164 const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d; 165 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 166 167 for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[ind + comp * STRIDES_COMP]; 168 } 169 } 170 171 //------------------------------------------------------------------------------ 172 // E-vector -> Q-vector, offests provided 173 //------------------------------------------------------------------------------ 174 template <int NUM_COMP, int COMP_STRIDE, int Q_1d> 175 inline __device__ void ReadEVecSliceStandard3d(SharedData_Hip &data, const CeedInt nquads, const CeedInt elem, const CeedInt q, 176 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, 177 CeedScalar *__restrict__ r_u) { 178 if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) { 179 const CeedInt node = data.t_id_x + data.t_id_y * Q_1d + q * Q_1d * Q_1d; 180 const CeedInt ind = indices[node + elem * Q_1d * Q_1d * Q_1d]; 181 182 for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp]; 183 } 184 } 185 186 //------------------------------------------------------------------------------ 187 // E-vector -> Q-vector, strided 188 //------------------------------------------------------------------------------ 189 template <int NUM_COMP, int Q_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 190 inline __device__ void ReadEVecSliceStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u, 191 CeedScalar *__restrict__ r_u) { 192 if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) { 193 const CeedInt node = data.t_id_x + data.t_id_y * Q_1d + q * Q_1d * Q_1d; 194 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 195 196 for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP]; 197 } 198 } 199 200 //------------------------------------------------------------------------------ 201 // E-vector -> L-vector, offsets provided 202 //------------------------------------------------------------------------------ 203 template <int NUM_COMP, int COMP_STRIDE, int P_1d> 204 inline __device__ void WriteLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, 205 const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { 206 if (data.t_id_x < P_1d && data.t_id_y < P_1d) 207 for (CeedInt z = 0; z < P_1d; z++) { 208 const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d; 209 const CeedInt ind = indices[node + elem * P_1d * P_1d * P_1d]; 210 211 for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1d]); 212 } 213 } 214 215 //------------------------------------------------------------------------------ 216 // E-vector -> L-vector, strided 217 //------------------------------------------------------------------------------ 218 template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM> 219 inline __device__ void WriteLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v, 220 CeedScalar *__restrict__ d_v) { 221 if (data.t_id_x < P_1d && data.t_id_y < P_1d) 222 for (CeedInt z = 0; z < P_1d; z++) { 223 const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d; 224 const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM; 225 226 for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1d]; 227 } 228 } 229 230 //------------------------------------------------------------------------------ 231 // 3D collocated derivatives computation 232 //------------------------------------------------------------------------------ 233 template <int NUM_COMP, int Q_1d> 234 inline __device__ void GradColloSlice3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, 235 CeedScalar *__restrict__ r_V) { 236 if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) { 237 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 238 data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1d]; 239 __syncthreads(); 240 // X derivative 241 r_V[comp + 0 * NUM_COMP] = 0.0; 242 for (CeedInt i = 0; i < Q_1d; i++) 243 r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1d] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction (X derivative) 244 // Y derivative 245 r_V[comp + 1 * NUM_COMP] = 0.0; 246 for (CeedInt i = 0; i < Q_1d; i++) 247 r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1d] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction (Y derivative) 248 // Z derivative 249 r_V[comp + 2 * NUM_COMP] = 0.0; 250 for (CeedInt i = 0; i < Q_1d; i++) r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1d] * r_U[i + comp * Q_1d]; // Contract z direction (Z derivative) 251 __syncthreads(); 252 } 253 } 254 } 255 256 //------------------------------------------------------------------------------ 257 // 3D collocated derivatives transpose 258 //------------------------------------------------------------------------------ 259 template <int NUM_COMP, int Q_1d> 260 inline __device__ void GradColloSliceTranspose3d(SharedData_Hip &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, 261 CeedScalar *__restrict__ r_V) { 262 if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) { 263 for (CeedInt comp = 0; comp < NUM_COMP; comp++) { 264 // X derivative 265 data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP]; 266 __syncthreads(); 267 for (CeedInt i = 0; i < Q_1d; i++) 268 r_V[q + comp * Q_1d] += c_G[data.t_id_x + i * Q_1d] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction (X derivative) 269 __syncthreads(); 270 // Y derivative 271 data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP]; 272 __syncthreads(); 273 for (CeedInt i = 0; i < Q_1d; i++) 274 r_V[q + comp * Q_1d] += c_G[data.t_id_y + i * Q_1d] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction (Y derivative) 275 __syncthreads(); 276 // Z derivative 277 for (CeedInt i = 0; i < Q_1d; i++) 278 r_V[i + comp * Q_1d] += c_G[i + q * Q_1d] * r_U[comp + 2 * NUM_COMP]; // PARTIAL contract z direction (Z derivative) 279 } 280 } 281 } 282