// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for CUDA backend macro and type definitions for JiT source
#ifndef _ceed_cuda_gen_templates_h
#define _ceed_cuda_gen_templates_h

#include <ceed/types.h>

//------------------------------------------------------------------------------
// Load matrices for basis actions
//------------------------------------------------------------------------------
// Copy the P*Q entries of the basis matrix d_B into B, spreading the copy
// over every thread of the block (data.t_id is the flat thread index).
template <int P, int Q>
inline __device__ void loadMatrix(SharedData_Cuda &data, const CeedScalar *__restrict__ d_B, CeedScalar *B) {
  const CeedInt num_threads = blockDim.x * blockDim.y * blockDim.z;
  for (CeedInt k = data.t_id; k < P * Q; k += num_threads) {
    B[k] = d_B[k];
  }
}

//------------------------------------------------------------------------------
// 1D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Gather all NCOMP components of this thread's element node into registers,
// resolving the L-vector location through the `indices` table.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset1d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                        const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  if (data.t_id_x >= P1d) return;  // only the first P1d threads own a node
  const CeedInt dof = indices[data.t_id_x + elem * P1d];
  for (CeedInt c = 0; c < NCOMP; ++c) {
    r_u[c] = d_u[dof + COMPSTRIDE * c];
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Gather all NCOMP components of this thread's element node into registers,
// the L-vector layout being described by compile-time strides.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  if (data.t_id_x >= P1d) return;
  const CeedInt base = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt c = 0; c < NCOMP; ++c) {
    r_u[c] = d_u[base + c * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Scatter-add registers back to the L-vector through the `indices` table;
// atomicAdd guards against multiple elements referencing the same L-vector entry.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset1d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                         const CeedScalar *r_v, CeedScalar *d_v) {
  if (data.t_id_x >= P1d) return;
  const CeedInt dof = indices[data.t_id_x + elem * P1d];
  for (CeedInt c = 0; c < NCOMP; ++c) {
    atomicAdd(&d_v[dof + COMPSTRIDE * c], r_v[c]);
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Add registers back into the strided L-vector with a plain += (presumably the
// strided layout gives each element node a unique location — no atomics used).
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided1d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
  if (data.t_id_x >= P1d) return;
  const CeedInt base = data.t_id_x * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt c = 0; c < NCOMP; ++c) {
    d_v[base + c * STRIDES_COMP] += r_v[c];
  }
}

//------------------------------------------------------------------------------
// 2D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// Gather all NCOMP components of the 2D element node (t_id_x, t_id_y) into
// registers, resolving the L-vector location through the `indices` table.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset2d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                        const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  if (data.t_id_x >= P1d || data.t_id_y >= P1d) return;  // inactive threads
  const CeedInt local = data.t_id_x + data.t_id_y * P1d;
  const CeedInt dof   = indices[local + elem * P1d * P1d];
  for (CeedInt c = 0; c < NCOMP; ++c) {
    r_u[c] = d_u[dof + COMPSTRIDE * c];
  }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Gather all NCOMP components of the 2D element node (t_id_x, t_id_y) into
// registers, the L-vector layout being described by compile-time strides.
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  if (data.t_id_x >= P1d || data.t_id_y >= P1d) return;
  const CeedInt local = data.t_id_x + data.t_id_y * P1d;
  const CeedInt base  = local * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt c = 0; c < NCOMP; ++c) {
    r_u[c] = d_u[base + c * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Scatter-add registers back to the L-vector through the `indices` table;
// atomicAdd guards against multiple elements referencing the same L-vector entry.
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset2d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                         const CeedScalar *r_v, CeedScalar *d_v) {
  if (data.t_id_x >= P1d || data.t_id_y >= P1d) return;
  const CeedInt local = data.t_id_x + data.t_id_y * P1d;
  const CeedInt dof   = indices[local + elem * P1d * P1d];
  for (CeedInt c = 0; c < NCOMP; ++c) {
    atomicAdd(&d_v[dof + COMPSTRIDE * c], r_v[c]);
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Add registers back into the strided L-vector with a plain += (presumably the
// strided layout gives each element node a unique location — no atomics used).
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided2d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
  if (data.t_id_x >= P1d || data.t_id_y >= P1d) return;
  const CeedInt local = data.t_id_x + data.t_id_y * P1d;
  const CeedInt base  = local * STRIDES_NODE + elem * STRIDES_ELEM;
  for (CeedInt c = 0; c < NCOMP; ++c) {
    d_v[base + c * STRIDES_COMP] += r_v[c];
  }
}

//------------------------------------------------------------------------------
// 3D
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// L-vector -> E-vector, offsets provided
//------------------------------------------------------------------------------
// TODO: remove "Dofs" and "Quads" in the following function names?
// - readDofsOffset3d -> readOffset3d ?
// - readDofsStrided3d -> readStrided3d ?
// - readSliceQuadsOffset3d -> readSliceOffset3d ?
// - readSliceQuadsStrided3d -> readSliceStrided3d ?
// - writeDofsOffset3d -> writeOffset3d ?
// - writeDofsStrided3d -> writeStrided3d ?
// Gather all NCOMP components of the P1d nodes in this thread's z-pencil of
// element `elem` into registers, via the `indices` table.
// Register layout: r_u[z + comp * P1d].
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void readDofsOffset3d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                        const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  if (data.t_id_x < P1d && data.t_id_y < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.t_id_x + data.t_id_y * P1d + z * P1d * P1d;
      const CeedInt ind  = indices[node + elem * P1d * P1d * P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[z + comp * P1d] = d_u[ind + COMPSTRIDE * comp];
    }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
// Gather all NCOMP components of the P1d nodes in this thread's z-pencil of
// element `elem` into registers, the L-vector layout given by compile-time
// strides. Register layout: r_u[z + comp * P1d].
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readDofsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  if (data.t_id_x < P1d && data.t_id_y < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.t_id_x + data.t_id_y * P1d + z * P1d * P1d;
      const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[z + comp * P1d] = d_u[ind + comp * STRIDES_COMP];
    }
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, offsets provided
//------------------------------------------------------------------------------
// Read all NCOMP components at the single quadrature level q of element `elem`
// into registers, via the `indices` table.
template <int NCOMP, int COMPSTRIDE, int Q1d>
inline __device__ void readSliceQuadsOffset3d(SharedData_Cuda &data, const CeedInt nquads, const CeedInt elem, const CeedInt q,
                                              const CeedInt *__restrict__ indices, const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
  if (data.t_id_x < Q1d && data.t_id_y < Q1d) {
    const CeedInt node = data.t_id_x + data.t_id_y * Q1d + q * Q1d * Q1d;
    const CeedInt ind  = indices[node + elem * Q1d * Q1d * Q1d];
    for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + COMPSTRIDE * comp];
  }
}

//------------------------------------------------------------------------------
// E-vector -> Q-vector, strided
//------------------------------------------------------------------------------
// Read all NCOMP components at the single quadrature level q of element `elem`
// into registers, the layout given by compile-time strides.
template <int NCOMP, int Q1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void readSliceQuadsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedInt q, const CeedScalar *__restrict__ d_u,
                                               CeedScalar *r_u) {
  if (data.t_id_x < Q1d && data.t_id_y < Q1d) {
    const CeedInt node = data.t_id_x + data.t_id_y * Q1d + q * Q1d * Q1d;
    const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
    for (CeedInt comp = 0; comp < NCOMP; ++comp) r_u[comp] = d_u[ind + comp * STRIDES_COMP];
  }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, offsets provided
//------------------------------------------------------------------------------
// Scatter-add this thread's z-pencil of registers back to the L-vector through
// the `indices` table; atomicAdd guards against multiple elements referencing
// the same L-vector entry. Register layout: r_v[z + comp * P1d].
template <int NCOMP, int COMPSTRIDE, int P1d>
inline __device__ void writeDofsOffset3d(SharedData_Cuda &data, const CeedInt nnodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                         const CeedScalar *r_v, CeedScalar *d_v) {
  if (data.t_id_x < P1d && data.t_id_y < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.t_id_x + data.t_id_y * P1d + z * P1d * P1d;
      const CeedInt ind  = indices[node + elem * P1d * P1d * P1d];
      for (CeedInt comp = 0; comp < NCOMP; ++comp) atomicAdd(&d_v[ind + COMPSTRIDE * comp], r_v[z + comp * P1d]);
    }
}

//------------------------------------------------------------------------------
// E-vector -> L-vector, strided
//------------------------------------------------------------------------------
// Add this thread's z-pencil of registers back into the strided L-vector with
// a plain += (presumably the strided layout gives each element node a unique
// location — no atomics used). Register layout: r_v[z + comp * P1d].
template <int NCOMP, int P1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void writeDofsStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *r_v, CeedScalar *d_v) {
  if (data.t_id_x < P1d && data.t_id_y < P1d)
    for (CeedInt z = 0; z < P1d; ++z) {
      const CeedInt node = data.t_id_x + data.t_id_y * P1d + z * P1d * P1d;
      const CeedInt ind  = node * STRIDES_NODE + elem * STRIDES_ELEM;
      for (CeedInt comp = 0; comp < NCOMP; ++comp) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P1d];
    }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives computation
//------------------------------------------------------------------------------
// Compute the X, Y, and Z derivatives of r_U at quadrature level q, using
// shared memory (data.slice) to exchange the x-y plane between threads.
// Output layout: r_V[comp + dim * NCOMP].
// NOTE: __syncthreads() must be reached by every thread of the block, so the
// barriers are placed outside the Q1d bounds guard (the guard implies the
// block may contain threads beyond Q1d, e.g. when T_1D > Q1d).
template <int NCOMP, int Q1d>
inline __device__ void gradCollo3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                   CeedScalar *__restrict__ r_V) {
  const bool active = data.t_id_x < Q1d && data.t_id_y < Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // Stage this component's level-q values in shared memory for the x/y contractions
    if (active) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q1d];
    __syncthreads();
    if (active) {
      // X derivative
      r_V[comp + 0 * NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp + 0 * NCOMP] += c_G[i + data.t_id_x * Q1d] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction (X derivative)
      // Y derivative
      r_V[comp + 1 * NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[comp + 1 * NCOMP] += c_G[i + data.t_id_y * Q1d] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction (Y derivative)
      // Z derivative — register-only along the z pencil, no shared memory needed
      r_V[comp + 2 * NCOMP] = 0.0;
      for (CeedInt i = 0; i < Q1d; ++i) r_V[comp + 2 * NCOMP] += c_G[i + q * Q1d] * r_U[i + comp * Q1d];  // Contract z direction (Z derivative)
    }
    __syncthreads();  // keep slice intact until all readers finish, before the next component overwrites it
  }
}

//------------------------------------------------------------------------------
// 3D collocated derivatives transpose
//------------------------------------------------------------------------------
// Transpose (adjoint) of gradCollo3d: accumulate the X/Y/Z derivative inputs
// r_U[comp + dim * NCOMP] into r_V at quadrature level q, using shared memory
// (data.slice) for the x/y contractions.
// NOTE(review): r_V is accumulated with += — presumably the caller has
// zero-initialized it; confirm at the call site.
// NOTE: __syncthreads() must be reached by every thread of the block, so the
// barriers are placed outside the Q1d bounds guard (the guard implies the
// block may contain threads beyond Q1d, e.g. when T_1D > Q1d).
template <int NCOMP, int Q1d>
inline __device__ void gradColloTranspose3d(SharedData_Cuda &data, const CeedInt q, const CeedScalar *__restrict__ r_U, const CeedScalar *c_G,
                                            CeedScalar *__restrict__ r_V) {
  const bool active = data.t_id_x < Q1d && data.t_id_y < Q1d;
  for (CeedInt comp = 0; comp < NCOMP; ++comp) {
    // X derivative
    if (active) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NCOMP];
    __syncthreads();
    if (active) {
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q + comp * Q1d] += c_G[data.t_id_x + i * Q1d] * data.slice[i + data.t_id_y * T_1D];  // Contract x direction (X derivative)
    }
    __syncthreads();
    // Y derivative
    if (active) data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NCOMP];
    __syncthreads();
    if (active) {
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[q + comp * Q1d] += c_G[data.t_id_y + i * Q1d] * data.slice[data.t_id_x + i * T_1D];  // Contract y direction (Y derivative)
    }
    __syncthreads();
    // Z derivative — register-only, no shared memory exchange required
    if (active) {
      for (CeedInt i = 0; i < Q1d; ++i)
        r_V[i + comp * Q1d] += c_G[i + q * Q1d] * r_U[comp + 2 * NCOMP];  // PARTIAL contract z direction (Z derivative)
    }
  }
}

#endif