// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for HIP backend macro and type definitions for JiT source
#ifndef CEED_HIP_GEN_TEMPLATES_H
#define CEED_HIP_GEN_TEMPLATES_H

#include <ceed/types.h>

//------------------------------------------------------------------------------
// Load matrices for basis actions
//------------------------------------------------------------------------------
// Cooperatively copy a P x Q basis matrix from global memory (d_B) into B,
// striding by the full thread-block size so every thread moves some entries.
template <int P, int Q>
inline __device__ void loadMatrix(SharedData_Hip& data, const CeedScalar* d_B, CeedScalar* B) {
  for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i];
}

//------------------------------------------------------------------------------
// 1D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Gather the DoFs of element 'elem' into registers r_u through the restriction
// indices; one thread per 1D node (threads with t_id_x >= P1d do nothing).
// r_u holds NCOMP values, one per component.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset1d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u,
                                        CeedScalar* r_u) {
  if (data.t_id_x < P1d) {
    const CeedInt node = data.t_id_x;
    const CeedInt ind  = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Gather the DoFs of element 'elem' into registers r_u at fixed compile-time
// strides (no indirection); one thread per 1D node.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided1d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.t_id_x < P1d) {
    const CeedInt node = data.t_id_x;
    const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Scatter-add element values r_v back to the L-vector through the restriction
// indices; atomicAdd is required because multiple elements may share a node.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset1d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices,
                                         const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.t_id_x < P1d) {
    const CeedInt node = data.t_id_x;
    const CeedInt ind  = indices[node + elem * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Accumulate element values r_v back to the L-vector at fixed strides; strided
// layouts give each element distinct locations, so a plain += suffices.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided1d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.t_id_x < P1d) {
    const CeedInt node = data.t_id_x;
    const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) d_v[ind + comp * STRIDES_COMP] += r_v[comp];
  }
}

//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Gather the DoFs of element 'elem' into registers r_u through the restriction
// indices; one thread per 2D node (t_id_x, t_id_y), NCOMP values per thread.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset2d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u,
                                        CeedScalar* r_u) {
  if (data.t_id_x < P1d && data.t_id_y < P1d) {
    const CeedInt node = data.t_id_x + data.t_id_y * P1d;
    const CeedInt ind  = indices[node + elem * P1d * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + COMPSTRIDE * comp];
    }
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Gather the DoFs of element 'elem' into registers r_u at fixed compile-time
// strides (no indirection); one thread per 2D node.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided2d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.t_id_x < P1d && data.t_id_y < P1d) {
    const CeedInt node = data.t_id_x + data.t_id_y * P1d;
    const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      r_u[comp] = d_u[ind + comp * STRIDES_COMP];
    }
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Scatter-add element values r_v back to the L-vector through the restriction
// indices; atomicAdd is required because multiple elements may share a node.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset2d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices,
                                         const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.t_id_x < P1d && data.t_id_y < P1d) {
    const CeedInt node = data.t_id_x + data.t_id_y * P1d;
    const CeedInt ind  = indices[node + elem * P1d * P1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[comp]);
    }
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Accumulate element values r_v back to the L-vector at fixed strides; strided
// layouts give each element distinct locations, so a plain += suffices.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided2d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.t_id_x < P1d && data.t_id_y < P1d) {
    const CeedInt node = data.t_id_x + data.t_id_y * P1d;
    const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      d_v[ind + comp * STRIDES_COMP] += r_v[comp];
    }
  }
}

//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Gather the DoFs of element 'elem' into registers r_u through the restriction
// indices. Each (t_id_x, t_id_y) thread walks the z column and holds P1d
// values per component, laid out as r_u[z + comp * P1d].
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset3d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices, const CeedScalar* d_u,
                                        CeedScalar* r_u) {
  if (data.t_id_x < P1d && data.t_id_y < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.t_id_x + data.t_id_y * P1d + z * P1d * P1d;
      const CeedInt ind  = indices[node + elem * P1d * P1d * P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp) {
        r_u[z + comp * P1d] = d_u[ind + COMPSTRIDE * comp];
      }
    }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Gather the DoFs of element 'elem' into registers r_u at fixed compile-time
// strides (no indirection). Each (t_id_x, t_id_y) thread walks the z column
// and holds P1d values per component, laid out as r_u[z + comp * P1d].
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided3d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.t_id_x < P1d && data.t_id_y < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.t_id_x + data.t_id_y * P1d + z * P1d * P1d;
      const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[z + comp * P1d] = d_u[ind + comp * STRIDES_COMP];
    }
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
//------------------------------------------------------------------------------
// Read one z-slice 'q' of quadrature-point values for element 'elem' through
// the provided indices; one thread per (t_id_x, t_id_y) quadrature point,
// NCOMP values per thread.
template <int NCOMP, int COMPSTRIDE, int Q1d>
inline __device__ void readSliceQuadsOffset3d(SharedData_Hip& data, const CeedInt nquads, const CeedInt elem, const CeedInt q, const CeedInt* indices,
                                              const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.t_id_x < Q1d && data.t_id_y < Q1d) {
    const CeedInt node = data.t_id_x + data.t_id_y * Q1d + q * Q1d * Q1d;
    const CeedInt ind  = indices[node + elem * Q1d * Q1d * Q1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, strided
//------------------------------------------------------------------------------
// Read one z-slice 'q' of quadrature-point values for element 'elem' at fixed
// compile-time strides; one thread per (t_id_x, t_id_y) quadrature point.
template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readSliceQuadsStrided3d(SharedData_Hip& data, const CeedInt elem, const CeedInt q, const CeedScalar* d_u, CeedScalar* r_u) {
  if (data.t_id_x < Q1d && data.t_id_y < Q1d) {
    const CeedInt node = data.t_id_x + data.t_id_y * Q1d + q * Q1d * Q1d;
    const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Scatter-add element values r_v (layout r_v[z + comp * P1d]) back to the
// L-vector through the restriction indices; atomicAdd is required because
// multiple elements may share a node.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset3d(SharedData_Hip& data, const CeedInt nnodes, const CeedInt elem, const CeedInt* indices,
                                         const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.t_id_x < P1d && data.t_id_y < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.t_id_x + data.t_id_y * P1d + z * P1d * P1d;
      const CeedInt ind  = indices[node + elem * P1d * P1d * P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp) atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z + comp * P1d]);
    }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Accumulate element values r_v (layout r_v[z + comp * P1d]) back to the
// L-vector at fixed strides; strided layouts give each element distinct
// locations, so a plain += suffices.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided3d(SharedData_Hip& data, const CeedInt elem, const CeedScalar* r_v, CeedScalar* d_v) {
  if (data.t_id_x < P1d && data.t_id_y < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.t_id_x + data.t_id_y * P1d + z * P1d * P1d;
      const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; ++comp) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P1d];
    }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives computation
//------------------------------------------------------------------------------
template <int NCOMP, int Q1d>
inline
__device__ void gradCollo3d(SharedData_Hip& data, const CeedInt q, const CeedScalar* __restrict__ r_U, const CeedScalar* c_G,
                            CeedScalar* __restrict__ r_V) {
  // Compute the X, Y, and Z derivatives of r_U at the collocated quadrature
  // z-slice 'q', writing them to r_V with layout r_V[comp + dim * NCOMP].
  // The X/Y contractions exchange the current slice through data.slice
  // (a T_1D-strided scratch buffer, presumably in shared memory — confirm);
  // the Z contraction is purely thread-local over r_U's z column.
  // NOTE(review): __syncthreads() sits inside the t_id guard; safe only if all
  // threads of the block take this branch uniformly (e.g. T_1D == Q1d) — TODO
  // confirm against the launch configuration used by the gen backend.
  if (data.t_id_x < Q1d && data.t_id_y < Q1d) {
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      // Publish this thread's value of slice q for component 'comp'
      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q1d];
      __syncthreads();
      // X derivative
      r_V[comp + 0 * NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp + 0 * NCOMP] += c_G[i + data.t_id_x * Q1d] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction (X derivative)
      // Y derivative
      r_V[comp + 1 * NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp + 1 * NCOMP] += c_G[i + data.t_id_y * Q1d] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction (Y derivative)
      // Z derivative (no slice exchange needed: the z column lives in registers)
      r_V[comp + 2 * NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i) r_V[comp + 2 * NCOMP] += c_G[i + q * Q1d] * r_U[i + comp * Q1d];  // Contract z direction (Z derivative)
      // Barrier before the next component reuses data.slice
      __syncthreads();
    }
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives transpose
//------------------------------------------------------------------------------
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(SharedData_Hip& data, const CeedInt q, const CeedScalar* __restrict__ r_U, const CeedScalar* c_G,
                                            CeedScalar* __restrict__ r_V) {
  // Transpose of gradCollo3d: accumulate the contribution of slice 'q' of the
  // gradient r_U (layout r_U[comp + dim * NCOMP]) into the values r_V (layout
  // r_V[z + comp * Q1d]). All updates are +=, so the caller must have
  // initialized r_V before the first call.
  // NOTE(review): as in gradCollo3d, __syncthreads() is inside the t_id guard;
  // verify all threads of the block reach it.
  if (data.t_id_x < Q1d && data.t_id_y < Q1d) {
    for (CeedInt comp = 0; comp < NCOMP; ++comp) {
      // X derivative
      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NCOMP];
      __syncthreads();
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q + comp * Q1d] += c_G[data.t_id_x + i * Q1d] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction (X derivative)
      __syncthreads();
      // Y derivative
      data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NCOMP];
      __syncthreads();
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q + comp * Q1d] += c_G[data.t_id_y + i * Q1d] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction (Y derivative)
      __syncthreads();
      // Z derivative: thread-local, spreads slice q's contribution over the z column
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[i + comp * Q1d] += c_G[i + q * Q1d] * r_U[comp + 2 * NCOMP];  // PARTIAL contract z direction (Z derivative)
    }
  }
}

#endif