// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for CUDA shared memory tensor product basis
#include <ceed/types.h>

#include "cuda-shared-basis-read-write-templates.h"
#include "cuda-shared-basis-tensor-templates.h"

//------------------------------------------------------------------------------
// Interp kernel by dim
//------------------------------------------------------------------------------
extern "C" __global__ void Interp(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);

  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];

  // load interp_1d into shared memory
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
  __syncthreads();

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                        BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
    }
  }
}
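
// Illustrative host-side launch sketch (hypothetical; the real driver lives in
// the CUDA backend, and elems_per_block, grid_size, d_interp_1d, d_u, and d_v
// are made-up names). One configuration consistent with the indexing above:
// blockDim.z packs several elements per thread block, and each of them needs
// its own slice of dynamic shared memory, matching the data.slice offset.
//
//   dim3   block(BASIS_T_1D, BASIS_DIM > 1 ? BASIS_T_1D : 1, elems_per_block);
//   size_t shared_bytes = block.z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1) * sizeof(CeedScalar);
//   Interp<<<grid_size, block, shared_bytes>>>(num_elem, d_interp_1d, d_u, d_v);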

extern "C" __global__ void InterpTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
                                           CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);

  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];

  // load interp_1d into shared memory
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
  __syncthreads();

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                       BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }
  }
}
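
// In index notation, InterpTranspose applies the transposed 1D operator in each
// tensor direction: with B the BASIS_Q_1D x BASIS_P_1D interp_1d matrix, the 1D
// case computes v_p = sum_q B[q, p] * u_q, restricting quadrature-point values
// back onto the BASIS_P_1D nodal coefficients (hence r_U and r_V swap sizes
// relative to Interp).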

extern "C" __global__ void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *__restrict__ d_U,
                                              CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);

  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];

  // load interp_1d into shared memory
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
  __syncthreads();

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                       BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V);
      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }
  }
}
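
// A small worked example of the register sizing above (illustrative values, not
// a required configuration): with BASIS_DIM = 3, BASIS_NUM_COMP = 2,
// BASIS_P_1D = 4, and BASIS_Q_1D = 5, each thread in the transpose kernels holds
// r_U[2 * 5] = 10 scalars in and r_V[2 * 4] = 8 out, i.e. one 1D pencil of the
// element per component, while t_id_x and t_id_y index the other two tensor
// directions. InterpTransposeAdd differs from InterpTranspose only in
// accumulating into d_V (SumElementStrided*d) instead of overwriting it.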

//------------------------------------------------------------------------------
// Grad kernel by dim
//------------------------------------------------------------------------------
extern "C" __global__ void Grad(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
                                CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);

  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
  CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];

  // load interp_1d and grad_1d into shared memory
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
  LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
  __syncthreads();

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
      Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      GradTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
      WriteElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V,
                                                                    d_V);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      if (BASIS_HAS_COLLOCATED_GRAD) GradTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
      else GradTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
      WriteElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
    }
  }
}
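
// The gradient has a Kronecker-product structure; in 3D the BASIS_DIM outputs
// packed into r_V are, schematically (exact ordering per the templates in
// cuda-shared-basis-tensor-templates.h):
//   dU/dX = (B (x) B (x) G) u,   dU/dY = (B (x) G (x) B) u,   dU/dZ = (G (x) B (x) B) u
// where (x) denotes the Kronecker product. With BASIS_HAS_COLLOCATED_GRAD, s_G
// is instead a BASIS_Q_1D x BASIS_Q_1D derivative matrix at the quadrature
// points, so GradTensorCollocated3d interpolates with B first and then
// differentiates at the quadrature points, reducing the number of contractions.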

extern "C" __global__ void GradTranspose(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
                                         CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);

  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];

  // load interp_1d and grad_1d into shared memory
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
  LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
  __syncthreads();

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
                                                                   r_U);
      GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                                   BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
      if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
      else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }
  }
}
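
// GradTranspose is the adjoint of Grad: it reads BASIS_NUM_COMP * BASIS_DIM
// quadrature fields and sums the transposed per-direction contractions into a
// single nodal field. As with the interp pair, GradTransposeAdd below differs
// only in accumulating into d_V (SumElementStrided*d) rather than overwriting.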

extern "C" __global__ void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *c_B, const CeedScalar *c_G, const CeedScalar *__restrict__ d_U,
                                            CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);

  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];

  // load interp_1d and grad_1d into shared memory
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B);
  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
  LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G);
  __syncthreads();

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
                                                                   r_U);
      GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                                   BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
      if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
      else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V);
      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }
  }
}

//------------------------------------------------------------------------------
// Weight kernels by dim
//------------------------------------------------------------------------------
extern "C" __global__ void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *__restrict__ d_W) {
  extern __shared__ CeedScalar slice[];

  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1);

  CeedScalar r_W[BASIS_DIM > 2 ? BASIS_Q_1D : 1];

  // Apply basis element by element
  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    if (BASIS_DIM == 1) {
      Weight1d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W);
      WriteElementStrided1d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_W, d_W);
    } else if (BASIS_DIM == 2) {
      WeightTensor2d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W);
      WriteElementStrided2d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_W, d_W);
    } else if (BASIS_DIM == 3) {
      WeightTensor3d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W);
      WriteElementStrided3d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_W,
                                           d_W);
    }
  }
}
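
//------------------------------------------------------------------------------
// Notes
//------------------------------------------------------------------------------
// The quadrature weight written for a tensor-product point is the product of
// the 1D weights, e.g. in 3D W[i, j, k] = q_weight_1d[i] * q_weight_1d[j] *
// q_weight_1d[k]. This header is meant to be JIT-compiled, so the BASIS_*
// macros (BASIS_DIM, BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D,
// BASIS_HAS_COLLOCATED_GRAD) must be supplied at compile time; a sketch with
// illustrative values (not the backend's exact invocation):
//   -DBASIS_DIM=3 -DBASIS_NUM_COMP=1 -DBASIS_P_1D=4 -DBASIS_Q_1D=5 \
//   -DBASIS_T_1D=5 -DBASIS_HAS_COLLOCATED_GRAD=0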