// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for CUDA backend macro and type definitions for JiT source
#ifndef CEED_CUDA_GEN_TEMPLATES_H
#define CEED_CUDA_GEN_TEMPLATES_H

#include <ceed/types.h>

//------------------------------------------------------------------------------
// Load matrices for basis actions
//------------------------------------------------------------------------------
// Cooperatively copy the P x Q basis matrix d_B (global memory) into B.
// Every thread in the block participates: each starts at its flat thread id
// (data.t_id) and strides by the total block size, so the copy is coalesced
// and correct for any block shape. NOTE(review): B is presumably a
// shared-memory (or otherwise block-visible) buffer of at least P * Q
// scalars -- confirm at the call site; no barrier is issued here, so the
// caller must synchronize before reading B.
template <int P, int Q>
inline __device__ void loadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
  for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i];
}

//------------------------------------------------------------------------------
// 1D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Gather the nodal values of element `elem` from the L-vector d_u into
// per-thread registers r_u via the index list `indices`: thread t_id_x owns
// node t_id_x; threads with t_id_x >= P_1d do nothing. Component `comp` is
// read from offset ind + COMP_STRIDE * comp and stored at r_u[comp].
// NOTE(review): num_nodes is unused in this variant -- it appears to be kept
// for signature symmetry with the other read/write helpers; confirm before
// removing.
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void readDofsOffset1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                        const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  if (data.t_id_x < P_1d) {
    const CeedInt node = data.t_id_x;
    const CeedInt ind  = indices[node + elem * P_1d];

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  // Gather one element's nodal values from the strided L-vector d_u into
  // registers r_u. Thread t_id_x handles node t_id_x; extra threads idle.
  if (data.t_id_x < P_1d) {
    const CeedInt base = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[base + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void writeDofsOffset1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                         const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Scatter-add register values r_v into the L-vector d_v through the index
  // list. atomicAdd is used because, with offsets, several threads (possibly
  // in different blocks) may target the same L-vector entry.
  if (data.t_id_x < P_1d) {
    const CeedInt dof = indices[data.t_id_x + elem * P_1d];

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[dof + COMP_STRIDE * comp], r_v[comp]);
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  // Accumulate register values r_v into the strided L-vector d_v. Unlike the
  // offsets variant above, a plain += is used here (no atomics).
  if (data.t_id_x < P_1d) {
    const CeedInt base = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[base + comp * STRIDES_COMP] += r_v[comp];
  }
}

//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void readDofsOffset2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                        const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Gather one element's nodal values into registers via the index list.
  // Thread (t_id_x, t_id_y) owns node t_id_x + t_id_y * P_1d; threads outside
  // the P_1d x P_1d tile idle.
  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
    const CeedInt dof = indices[data.t_id_x + data.t_id_y * P_1d + elem * P_1d * P_1d];

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[dof + COMP_STRIDE * comp];
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  // Gather one element's nodal values from the strided L-vector into registers.
  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
    const CeedInt base = (data.t_id_x + data.t_id_y * P_1d) * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[base + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void writeDofsOffset2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                         const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Scatter-add register values into the L-vector through the index list;
  // atomicAdd handles entries shared between elements/blocks.
  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
    const CeedInt dof = indices[data.t_id_x + data.t_id_y * P_1d + elem * P_1d * P_1d];

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[dof + COMP_STRIDE * comp], r_v[comp]);
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  // Accumulate register values into the strided L-vector; plain += is used
  // here, matching the non-atomic access pattern of the strided layout.
  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
    const CeedInt base = (data.t_id_x + data.t_id_y * P_1d) * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[base + comp * STRIDES_COMP] += r_v[comp];
  }
}

//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// TODO: remove "Dofs" and "Quads" in the following function names?
// - readDofsOffset3d -> readOffset3d ?
// - readDofsStrided3d -> readStrided3d ?
// - readSliceQuadsOffset3d -> readSliceOffset3d ?
// - readSliceQuadsStrided3d -> readSliceStrided3d ?
// - writeDofsOffset3d -> writeOffset3d ?
// - writeDofsStrided3d -> writeStrided3d ?
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void readDofsOffset3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                        const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Gather one element's nodal values via the index list. Thread
  // (t_id_x, t_id_y) walks its z line of nodes; component comp of node z is
  // stored at r_u[z + comp * P_1d].
  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
    const CeedInt xy          = data.t_id_x + data.t_id_y * P_1d;
    const CeedInt elem_offset = elem * P_1d * P_1d * P_1d;

    for (CeedInt z = 0; z < P_1d; z++) {
      const CeedInt dof = indices[xy + z * P_1d * P_1d + elem_offset];

      for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[dof + COMP_STRIDE * comp];
    }
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  // Gather one element's nodal values from the strided L-vector; same register
  // layout as readDofsOffset3d (r_u[z + comp * P_1d]).
  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
    const CeedInt xy          = data.t_id_x + data.t_id_y * P_1d;
    const CeedInt elem_offset = elem * STRIDES_ELEM;

    for (CeedInt z = 0; z < P_1d; z++) {
      const CeedInt base = (xy + z * P_1d * P_1d) * STRIDES_NODE + elem_offset;

      for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[base + comp * STRIDES_COMP];
    }
  }
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
//------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int Q_1d>
inline __device__ void readSliceQuadsOffset3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
                                              const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u,
                                              CeedScalar *__restrict__ r_u) {
  // Read the z = q slice of quadrature-point values into registers via the
  // index list; thread (t_id_x, t_id_y) owns one point of the slice.
  if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
    const CeedInt quad = indices[data.t_id_x + data.t_id_y * Q_1d + q * Q_1d * Q_1d + elem * Q_1d * Q_1d * Q_1d];

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[quad + COMP_STRIDE * comp];
  }
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, strided
//------------------------------------------------------------------------------
template <int NUM_COMP, int Q_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readSliceQuadsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
                                               CeedScalar *__restrict__ r_u) {
  // Read the z = q slice of quadrature-point values from the strided layout.
  if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
    const CeedInt base = (data.t_id_x + data.t_id_y * Q_1d + q * Q_1d * Q_1d) * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[base + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void writeDofsOffset3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                         const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Scatter-add register values (layout r_v[z + comp * P_1d]) into the
  // L-vector through the index list; atomicAdd handles shared entries.
  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
    const CeedInt xy          = data.t_id_x + data.t_id_y * P_1d;
    const CeedInt elem_offset = elem * P_1d * P_1d * P_1d;

    for (CeedInt z = 0; z < P_1d; z++) {
      const CeedInt dof = indices[xy + z * P_1d * P_1d + elem_offset];

      for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[dof + COMP_STRIDE * comp], r_v[z + comp * P_1d]);
    }
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  // Accumulate register values (layout r_v[z + comp * P_1d]) into the strided
  // L-vector; plain += (no atomics), matching the other strided writers.
  if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
    const CeedInt xy = data.t_id_x + data.t_id_y * P_1d;

    for (CeedInt z = 0; z < P_1d; z++) {
      const CeedInt base = (xy + z * P_1d * P_1d) * STRIDES_NODE + elem * STRIDES_ELEM;

      for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[base + comp * STRIDES_COMP] += r_v[z + comp * P_1d];
    }
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives computation
//------------------------------------------------------------------------------
// Compute the x/y/z derivatives at the z = q quadrature slice from the
// collocated gradient matrix c_G. Input r_U holds component comp's z line at
// r_U[i + comp * Q_1d]; output r_V holds the three derivatives of component
// comp at r_V[comp + dim * NUM_COMP]. data.slice stages the current slice in
// shared memory (rows padded to T_1D) for the x/y contractions.
//
// FIX: the barriers are hoisted OUT of the active-thread guard. The original
// placed __syncthreads() inside `if (t_id_x < Q_1d && t_id_y < Q_1d)`, which
// is undefined behavior whenever the block is larger than Q_1d x Q_1d (the
// T_1D-padded slice indicates the block dimension can exceed Q_1d): threads
// outside the tile would skip a block-wide barrier. All threads now reach
// every __syncthreads(); only the loads/compute remain guarded, so results
// for active threads are unchanged.
template <int NUM_COMP, int Q_1d>
inline __device__ void gradCollo3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                   CeedScalar *__restrict__ r_V) {
  const bool active = data.t_id_x < Q_1d && data.t_id_y < Q_1d;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Stage this component's q slice in shared memory for the x/y contractions
    if (active) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1d];
    __syncthreads();
    if (active) {
      // X derivative: contract x direction
      r_V[comp + 0 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1d; i++) r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1d] * data.slice[i + data.t_id_y * T_1D];
      // Y derivative: contract y direction
      r_V[comp + 1 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1d; i++) r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1d] * data.slice[data.t_id_x + i * T_1D];
      // Z derivative: contract z direction (registers only, no slice needed)
      r_V[comp + 2 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1d; i++) r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1d] * r_U[i + comp * Q_1d];
    }
    // Keep the slice intact until every reader is done before the next
    // component overwrites it
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives transpose
//------------------------------------------------------------------------------
// Transpose action of gradCollo3d: accumulate the three derivative components
// r_U[comp + dim * NUM_COMP] back into the z lines r_V[i + comp * Q_1d].
// The z contraction is only PARTIAL for this slice; the caller accumulates
// over q. Barriers are hoisted out of the active-thread guard for the same
// reason as in gradCollo3d (block-uniform __syncthreads()).
template <int NUM_COMP, int Q_1d>
inline __device__ void gradColloTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                            CeedScalar *__restrict__ r_V) {
  const bool active = data.t_id_x < Q_1d && data.t_id_y < Q_1d;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // X derivative: contract x direction (transposed)
    if (active) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
    __syncthreads();
    if (active) {
      for (CeedInt i = 0; i < Q_1d; i++) r_V[q + comp * Q_1d] += c_G[data.t_id_x + i * Q_1d] * data.slice[i + data.t_id_y * T_1D];
    }
    __syncthreads();
    // Y derivative: contract y direction (transposed)
    if (active) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
    __syncthreads();
    if (active) {
      for (CeedInt i = 0; i < Q_1d; i++) r_V[q + comp * Q_1d] += c_G[data.t_id_y + i * Q_1d] * data.slice[data.t_id_x + i * T_1D];
    }
    __syncthreads();
    // Z derivative: PARTIAL contract z direction, registers only
    if (active) {
      for (CeedInt i = 0; i < Q_1d; i++) r_V[i + comp * Q_1d] += c_G[i + q * Q_1d] * r_U[comp + 2 * NUM_COMP];
    }
  }
}

#endif  // CEED_CUDA_GEN_TEMPLATES_H