// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for CUDA backend macro and type definitions for JiT source
#include <ceed/types.h>

//------------------------------------------------------------------------------
// Load matrices for basis actions
//------------------------------------------------------------------------------
// Cooperatively copy the P x Q basis matrix d_B into B; each thread of the
// block handles one entry per round, striding by the flat block size.
template <int P, int Q>
inline __device__ void LoadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
  const CeedInt block_size = blockDim.x * blockDim.y * blockDim.z;

  for (CeedInt i = data.t_id; i < P * Q; i += block_size) {
    B[i] = d_B[i];
  }
}

//------------------------------------------------------------------------------
// AtPoints
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> single point
//------------------------------------------------------------------------------
// Gather all components of point p of element elem from d_u into registers r_u.
// NOTE(review): unlike WritePoint there is no p < points_in_elem guard here --
// presumably the index array is padded so out-of-range reads are benign; confirm
// against the callers.
template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
                                 const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  const CeedScalar *u = d_u + indices[p + elem * NUM_PTS];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = u[comp * COMP_STRIDE];
}

//------------------------------------------------------------------------------
// Single point -> L-vector
//------------------------------------------------------------------------------
// Accumulate registers r_u for point p of element elem back into d_u; points at
// or past points_in_elem are skipped.
template <int NUM_COMP, int COMP_STRIDE, int NUM_PTS>
inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
                                  const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ r_u, CeedScalar *d_u) {
  if (p >= points_in_elem) return;

  CeedScalar *u = d_u + indices[p + elem * NUM_PTS];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) u[comp * COMP_STRIDE] += r_u[comp];
}

//------------------------------------------------------------------------------
// 1D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// One thread per node: load all components of node t_id_x of element elem into
// registers r_u via the restriction index array (num_nodes is unused here).
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void ReadLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  if (data.t_id_x >= P_1D) return;

  const CeedInt ind = indices[data.t_id_x + elem * P_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[ind + COMP_STRIDE * comp];
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// One thread per node: load all components of node t_id_x of element elem into
// registers r_u, addressing d_u with compile-time strides instead of indices.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  if (data.t_id_x >= P_1D) return;

  const CeedInt ind = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// One thread per node: accumulate registers r_v into the L-vector d_v via the
// restriction indices; atomicAdd guards against concurrent updates of the same
// L-vector entry (num_nodes is unused here).
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  if (data.t_id_x >= P_1D) return;

  const CeedInt ind = indices[data.t_id_x + elem * P_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// One thread per node: accumulate registers r_v into d_v with compile-time
// strides; plain += suffices since strided layouts give each node a unique slot.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  if (data.t_id_x >= P_1D) return;

  const CeedInt ind = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    d_v[ind + comp * STRIDES_COMP] += r_v[comp];
  }
}

//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// One thread per (x, y) node of the P_1D x P_1D element: load all components of
// that node into registers r_u via the restriction indices (num_nodes unused).
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void ReadLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
  const CeedInt ind  = indices[node + elem * P_1D * P_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[ind + COMP_STRIDE * comp];
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// One thread per (x, y) node: load all components of that node into registers
// r_u, addressing d_u with compile-time strides instead of an index array.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
  const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// One thread per (x, y) node: accumulate registers r_v into the L-vector d_v via
// the restriction indices; atomicAdd guards concurrent updates of shared entries.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
  const CeedInt ind  = indices[node + elem * P_1D * P_1D];

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]);
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// One thread per (x, y) node: accumulate registers r_v into d_v with
// compile-time strides; plain += since each node owns a unique slot.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt node = data.t_id_x + data.t_id_y * P_1D;
  const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    d_v[ind + comp * STRIDES_COMP] += r_v[comp];
  }
}

//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// One thread per (x, y) column: loop over the z levels of the P_1D^3 element and
// load all components into registers, z-contiguous per component
// (r_u[z + comp * P_1D]); num_nodes is unused here.
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  if (data.t_id_x >= P_1D || data.t_id_y >= P_1D) return;

  const CeedInt base = data.t_id_x + data.t_id_y * P_1D;

  for (CeedInt z = 0; z < P_1D; z++) {
    const CeedInt ind = indices[base + z * P_1D * P_1D + elem * P_1D * P_1D * P_1D];

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
      r_u[z + comp * P_1D] = d_u[ind + COMP_STRIDE * comp];
    }
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// One thread per (x, y) column: loop over the z levels of the P_1D^3 element and
// load all components into registers, z-contiguous per component
// (r_u[z + comp * P_1D]), addressing d_u with compile-time strides.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    for (CeedInt z = 0; z < P_1D; z++) {
      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
      const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;

      for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1D] = d_u[ind + comp * STRIDES_COMP];
    }
  }
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
//------------------------------------------------------------------------------
// One thread per (x, y) point of the q-th Q_1D x Q_1D slice: load all components
// of that quadrature point into registers r_u via indices (nquads is unused).
template <int NUM_COMP, int COMP_STRIDE, int Q_1D>
inline __device__ void ReadEVecSliceStandard3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
                                               const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u,
                                               CeedScalar *__restrict__ r_u) {
  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
    const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
    const CeedInt ind  = indices[node + elem * Q_1D * Q_1D * Q_1D];

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + COMP_STRIDE * comp];
  }
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, strided
//------------------------------------------------------------------------------
// One thread per (x, y) point of the q-th slice: load all components of that
// quadrature point into registers r_u, addressing d_u with compile-time strides.
template <int NUM_COMP, int Q_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
                                              CeedScalar *__restrict__ r_u) {
  if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
    const CeedInt node = data.t_id_x + data.t_id_y * Q_1D + q * Q_1D * Q_1D;
    const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;

    for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// One thread per (x, y) column: loop over z levels and accumulate registers
// r_v[z + comp * P_1D] into the L-vector via indices; atomicAdd guards
// concurrent updates of shared entries (num_nodes is unused here).
template <int NUM_COMP, int COMP_STRIDE, int P_1D>
inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    for (CeedInt z = 0; z < P_1D; z++) {
      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
      const CeedInt ind  = indices[node + elem * P_1D * P_1D * P_1D];

      for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1D]);
    }
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// One thread per (x, y) column: loop over z levels and accumulate registers into
// d_v with compile-time strides; plain += since each node owns a unique slot.
template <int NUM_COMP, int P_1D, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  if (data.t_id_x < P_1D && data.t_id_y < P_1D) {
    for (CeedInt z = 0; z < P_1D; z++) {
      const CeedInt node = data.t_id_x + data.t_id_y * P_1D + z * P_1D * P_1D;
      const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;

      for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1D];
    }
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives computation
//------------------------------------------------------------------------------
// Compute X/Y/Z derivatives at quadrature slice q from z-contiguous registers
// r_U[i + comp * Q_1D] into r_V[comp + dim * NUM_COMP], exchanging the x-y slice
// through data.slice (leading dimension T_1D, which may exceed Q_1D).
// FIX: __syncthreads() is hoisted outside the t_id guard -- when T_1D > Q_1D the
// block contains threads that fail the predicate, and a barrier inside divergent
// control flow is undefined behavior (potential deadlock). All threads now reach
// every barrier; only the slice accesses and register math remain guarded.
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                        CeedScalar *__restrict__ r_V) {
  const bool is_active = data.t_id_x < Q_1D && data.t_id_y < Q_1D;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // Stage this component's q-th slice in shared memory for cross-thread reads
    if (is_active) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1D];
    __syncthreads();
    if (is_active) {
      // X derivative
      r_V[comp + 0 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1D] * data.slice[i + data.t_id_y * T_1D];
      }
      // Y derivative
      r_V[comp + 1 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1D] * data.slice[data.t_id_x + i * T_1D];
      }
      // Z derivative (registers only -- no shared memory involved)
      r_V[comp + 2 * NUM_COMP] = 0.0;
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1D] * r_U[i + comp * Q_1D];
      }
    }
    // Barrier before the next component overwrites the slice
    __syncthreads();
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives transpose
//------------------------------------------------------------------------------
// Transpose of GradColloSlice3d: accumulate the X/Y/Z derivative contributions
// from r_U[comp + dim * NUM_COMP] into r_V[i + comp * Q_1D] at slice q, again
// exchanging the x-y slice through data.slice (leading dimension T_1D).
// FIX: as above, all __syncthreads() are executed unconditionally by the whole
// block; previously they sat inside the t_id guard (divergent barrier when
// T_1D > Q_1D).
template <int NUM_COMP, int Q_1D, int T_1D>
inline __device__ void GradColloSliceTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                                 CeedScalar *__restrict__ r_V) {
  const bool is_active = data.t_id_x < Q_1D && data.t_id_y < Q_1D;

  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
    // X derivative
    if (is_active) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
    __syncthreads();
    if (is_active) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[q + comp * Q_1D] += c_G[data.t_id_x + i * Q_1D] * data.slice[i + data.t_id_y * T_1D];
      }
    }
    __syncthreads();
    // Y derivative
    if (is_active) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
    __syncthreads();
    if (is_active) {
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[q + comp * Q_1D] += c_G[data.t_id_y + i * Q_1D] * data.slice[data.t_id_x + i * T_1D];
      }
      // Z derivative (registers only -- no shared memory involved)
      for (CeedInt i = 0; i < Q_1D; i++) {
        r_V[i + comp * Q_1D] += c_G[i + q * Q_1D] * r_U[comp + 2 * NUM_COMP];
      }
    }
    // Barrier before the next component overwrites the slice
    __syncthreads();
  }
}