// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for HIP backend macro and type definitions for JiT source
#ifndef _ceed_hip_gen_templates_h
#define _ceed_hip_gen_templates_h

#include <ceed/types.h>

//------------------------------------------------------------------------------
// Load matrices for basis actions
//------------------------------------------------------------------------------
// All threads of the block cooperate to copy the P x Q basis matrix d_B into B,
// striding over the flat entries by the total block size.
template <int P, int Q>
inline __device__ void loadMatrix(SharedData_Hip& data, const CeedScalar* d_B, CeedScalar* B) {
  const CeedInt num_threads = blockDim.x * blockDim.y * blockDim.z;
  for (CeedInt k = data.t_id; k < P * Q; k += num_threads) {
    B[k] = d_B[k];
  }
}

//------------------------------------------------------------------------------
// 1D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Each of the first P1d threads in x gathers its node's NCOMP components from
// d_u into registers r_u, through the element's slot of the indices table.
// (nnodes is part of the common signature but unused here.)
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset1d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.t_id_x >= P1d) return;
  const CeedInt index = indices[data.t_id_x + elem * P1d];
  for (CeedInt c = 0; c < NCOMP; c++) {
    r_u[c] = d_u[index + COMPSTRIDE * c];
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Strided variant: the node's location is computed from the compile-time
// strides instead of an indirection table.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided1d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.t_id_x >= P1d) return;
  const CeedInt index = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt c = 0; c < NCOMP; c++) {
    r_u[c] = d_u[index + c * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Scatter-accumulate of register values r_v back into d_v through the indices
// table; accumulation uses atomicAdd (destination entries can be written by
// more than one thread).
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset1d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.t_id_x >= P1d) return;
  const CeedInt index = indices[data.t_id_x + elem * P1d];
  for (CeedInt c = 0; c < NCOMP; c++) {
    atomicAdd(&d_v[index + COMPSTRIDE * c], r_v[c]);
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Strided variant: accumulates with a plain +=, unlike the offset variant
// which goes through atomicAdd.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided1d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.t_id_x >= P1d) return;
  const CeedInt index = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt c = 0; c < NCOMP; c++) {
    d_v[index + c * STRIDES_COMP] += r_v[c];
  }
}

//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Each thread with (t_id_x, t_id_y) inside the P1d x P1d node grid gathers its
// node's NCOMP components from d_u into registers r_u via the indices table.
// (nnodes is part of the common signature but unused here.)
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset2d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.t_id_x >= P1d || data.t_id_y >= P1d) return;
  const CeedInt dof   = data.t_id_x + P1d * data.t_id_y;
  const CeedInt index = indices[dof + elem * P1d * P1d];
  for (CeedInt c = 0; c < NCOMP; c++) {
    r_u[c] = d_u[index + COMPSTRIDE * c];
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Strided variant: node locations come from compile-time strides instead of an
// indirection table.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided2d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.t_id_x >= P1d || data.t_id_y >= P1d) return;
  const CeedInt dof   = data.t_id_x + P1d * data.t_id_y;
  const CeedInt index = dof * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt c = 0; c < NCOMP; c++) {
    r_u[c] = d_u[index + c * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Scatter-accumulate of register values r_v into d_v through the indices
// table; accumulation uses atomicAdd (destination entries can be written by
// more than one thread).
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset2d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.t_id_x >= P1d || data.t_id_y >= P1d) return;
  const CeedInt dof   = data.t_id_x + P1d * data.t_id_y;
  const CeedInt index = indices[dof + elem * P1d * P1d];
  for (CeedInt c = 0; c < NCOMP; c++) {
    atomicAdd(&d_v[index + COMPSTRIDE * c], r_v[c]);
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Strided variant: accumulates with a plain +=, no indirection or atomics.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided2d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.t_id_x >= P1d || data.t_id_y >= P1d) return;
  const CeedInt dof   = data.t_id_x + P1d * data.t_id_y;
  const CeedInt index = dof * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt c = 0; c < NCOMP; c++) {
    d_v[index + c * STRIDES_COMP] += r_v[c];
  }
}

//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Each thread owns a full column of P1d nodes along z; values land in
// registers r_u with layout r_u[z + comp*P1d].
// (nnodes is part of the common signature but unused here.)
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset3d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.t_id_x >= P1d || data.t_id_y >= P1d) return;
  for (CeedInt z = 0; z < P1d; z++) {
    const CeedInt dof   = data.t_id_x + P1d * (data.t_id_y + P1d * z);
    const CeedInt index = indices[dof + elem * P1d * P1d * P1d];
    for (CeedInt c = 0; c < NCOMP; c++) {
      r_u[z + c * P1d] = d_u[index + COMPSTRIDE * c];
    }
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Strided variant of readDofsOffset3d: each thread reads its column of P1d
// nodes for all NCOMP components into registers r_u (layout r_u[z + comp*P1d]),
// with node locations computed from compile-time strides.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided3d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.t_id_x < P1d && data.t_id_y < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.t_id_x + data.t_id_y*P1d + z*P1d*P1d;
      const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        r_u[z+comp*P1d] = d_u[ind + comp * STRIDES_COMP];
    }
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
//------------------------------------------------------------------------------
// Reads a single z-slice q of quadrature data for all NCOMP components into
// r_u, through the indices table.
// (nquads is part of the common signature but unused here.)
template <int NCOMP, int COMPSTRIDE, int Q1d>
inline __device__ void readSliceQuadsOffset3d(SharedData_Hip& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.t_id_x < Q1d && data.t_id_y < Q1d) {
    const CeedInt node = data.t_id_x + data.t_id_y*Q1d + q*Q1d*Q1d;
    const CeedInt ind = indices[node + elem * Q1d*Q1d*Q1d]; // stray second ';' removed
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, strided
//------------------------------------------------------------------------------
// Strided variant of readSliceQuadsOffset3d: node location computed from
// compile-time strides instead of an indirection table.
template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readSliceQuadsStrided3d(SharedData_Hip& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.t_id_x < Q1d && data.t_id_y < Q1d) {
    const CeedInt node = data.t_id_x + data.t_id_y*Q1d + q*Q1d*Q1d;
    const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp)
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Scatter-accumulate of each thread's register column r_v[z + comp*P1d] into
// d_v via the indices table; accumulation uses atomicAdd (destination entries
// can be written by more than one thread).
// (nnodes is part of the common signature but unused here.)
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset3d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.t_id_x < P1d && data.t_id_y < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.t_id_x + data.t_id_y*P1d + z*P1d*P1d;
      const CeedInt ind = indices[node + elem * P1d*P1d*P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z+comp*P1d]);
    }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Strided variant: accumulates with a plain += (no atomics).
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided3d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.t_id_x < P1d && data.t_id_y < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.t_id_x + data.t_id_y*P1d + z*P1d*P1d;
      const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; ++comp)
        d_v[ind + comp * STRIDES_COMP] += r_v[z+comp*P1d];
    }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives computation
//------------------------------------------------------------------------------
template <int NCOMP, int Q1d>
inline __device__ void
gradCollo3d(SharedData_Hip& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Evaluates the X/Y/Z derivatives at the collocated quadrature z-slice q.
  // Inputs:  r_U  per-thread registers, layout r_U[i + comp*Q1d]
  //          c_G  collocated gradient matrix, indexed as c_G[i + j*Q1d]
  // Output:  r_V  layout r_V[comp + dim*NCOMP]; overwritten (zeroed then accumulated).
  // NOTE(review): the __syncthreads() calls sit inside the t_id guard; this is
  // only safe if every thread of the block passes the guard (i.e. the launch
  // uses Q1d x Q1d thread blocks) -- confirm against the gen-backend launch
  // configuration.
  if (data.t_id_x < Q1d && data.t_id_y < Q1d) {
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      // Stage this component's q-th xy-slice in data.slice so threads can read
      // each other's values during the x/y contractions.
      data.slice[data.t_id_x + data.t_id_y*T_1D] = r_U[q + comp*Q1d];
      __syncthreads();
      // X derivative
      r_V[comp+0*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+0*NCOMP] += c_G[i + data.t_id_x*Q1d] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction (X derivative)
      // Y derivative
      r_V[comp+1*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+1*NCOMP] += c_G[i + data.t_id_y*Q1d] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction (Y derivative)
      // Z derivative -- register-only: the thread's full z-column is in r_U
      r_V[comp+2*NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp+2*NCOMP] += c_G[i + q*Q1d] * r_U[i + comp*Q1d]; // Contract z direction (Z derivative)
      __syncthreads(); // keep data.slice stable until all threads finish this component
    }
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives transpose
//------------------------------------------------------------------------------
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(SharedData_Hip& data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G, CeedScalar *__restrict__ r_V) {
  // Transpose action of gradCollo3d: accumulates (+=, r_V is NOT zeroed) the
  // three derivative components r_U[comp + dim*NCOMP] back into r_V, layout
  // r_V[i + comp*Q1d]. The c_G index order is swapped relative to gradCollo3d,
  // giving the transposed contraction.
  // NOTE(review): same barrier-inside-guard assumption as gradCollo3d -- all
  // threads of the block must satisfy t_id_x/y < Q1d.
  if (data.t_id_x < Q1d && data.t_id_y < Q1d) {
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      // X derivative
      data.slice[data.t_id_x + data.t_id_y*T_1D] = r_U[comp + 0*NCOMP];
      __syncthreads();
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.t_id_x + i*Q1d] * data.slice[i + data.t_id_y*T_1D]; // Contract x direction (X derivative)
      __syncthreads();
      // Y derivative
      data.slice[data.t_id_x + data.t_id_y*T_1D] = r_U[comp + 1*NCOMP];
      __syncthreads();
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q+comp*Q1d] += c_G[data.t_id_y + i*Q1d] * data.slice[data.t_id_x + i*T_1D]; // Contract y direction (Y derivative)
      __syncthreads();
      // Z derivative -- register-only; spreads slice q's value over the z-column
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[i+comp*Q1d] += c_G[i + q*Q1d] * r_U[comp + 2*NCOMP]; // PARTIAL contract z direction (Z derivative)
    }
  }
}

#endif