// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for HIP shared memory tensor product basis
///
/// All kernels below share the same launch conventions (from the indexing they use):
/// - Each thread block processes blockDim.z elements at a time; elements are
///   distributed across blocks with a grid-stride loop of step gridDim.x * blockDim.z.
/// - Dynamic shared memory must provide blockDim.z slices of
///   T_1D * (T_1D when BASIS_DIM > 1, else 1) CeedScalars; each element's threads
///   (t_id_z = threadIdx.z) get their own slice for cross-thread exchanges.
/// - BASIS_* and T_1D are compile-time macros supplied by the JIT driver
///   (presumably at kernel compilation time — defined outside this file).

#include <ceed.h>

#include "hip-shared-basis-read-write-templates.h"
#include "hip-shared-basis-tensor-templates.h"

//------------------------------------------------------------------------------
// Interp kernel by dim
//------------------------------------------------------------------------------

// Interpolate from BASIS_P_1D nodes to BASIS_Q_1D points in each tensor direction.
// d_U: strided element input (P^dim values per component), d_V: strided output (Q^dim).
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
    void Interp(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  // Stage interp_1d in shared memory so every thread in the block reuses one copy
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_B);
  __syncthreads();

  // Per-thread bookkeeping: flattened thread id and this element's shared slice
  SharedData_Hip data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

  // Registers: one value per component (times one value per 1D node/point in z for 3D)
  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                        BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
    }
  }
}

// Transpose of Interp: map BASIS_Q_1D points back to BASIS_P_1D nodes, overwriting d_V.
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
    void InterpTranspose(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  // Stage interp_1d in shared memory so every thread in the block reuses one copy
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_B);
  __syncthreads();

  // Per-thread bookkeeping: flattened thread id and this element's shared slice
  SharedData_Hip data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

  // Register sizes are swapped relative to Interp: input at Q points, output at P nodes
  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                       BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }
  }
}

// Same as InterpTranspose but accumulates into d_V (SumElementStrided*) instead of overwriting.
extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
    void InterpTransposeAdd(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *__restrict__ d_U,
                            CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  // Stage interp_1d in shared memory so every thread in the block reuses one copy
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_B);
  __syncthreads();

  // Per-thread bookkeeping: flattened thread id and this element's shared slice
  SharedData_Hip data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                       BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_V);
      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }
  }
}

//------------------------------------------------------------------------------
// Grad kernel by dim
//------------------------------------------------------------------------------

// Apply the gradient: for dim > 1 writes BASIS_NUM_COMP * BASIS_DIM output fields.
// When BASIS_HAS_COLLOCATED_GRAD is set, d_grad_1d is Q x Q (gradient at collocated
// quadrature points); otherwise it is Q x P.
extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
    void Grad(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *d_grad_1d, const CeedScalar *__restrict__ d_U,
              CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  // Stage interp_1d and grad_1d in shared memory; s_G size depends on collocated layout
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_B);
  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
  loadMatrix<BASIS_Q_1D *(BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)>(d_grad_1d, s_G);
  __syncthreads();

  // Per-thread bookkeeping: flattened thread id and this element's shared slice
  SharedData_Hip data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

  // Output holds one gradient component per dimension, hence the BASIS_DIM factor
  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
  CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
      Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      GradTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
      WriteElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_V,
                                                                    d_V);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
      if (BASIS_HAS_COLLOCATED_GRAD) GradTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
      else GradTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
      WriteElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                                    BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_V, d_V);
    }
  }
}

// Transpose of Grad: contracts BASIS_NUM_COMP * BASIS_DIM input fields (for dim > 1)
// back to BASIS_NUM_COMP fields at P nodes, overwriting d_V.
extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
    void GradTranspose(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *d_grad_1d, const CeedScalar *__restrict__ d_U,
                       CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  // Stage interp_1d and grad_1d in shared memory; s_G size depends on collocated layout
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_B);
  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
  loadMatrix<BASIS_Q_1D *(BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)>(d_grad_1d, s_G);
  __syncthreads();

  // Per-thread bookkeeping: flattened thread id and this element's shared slice
  SharedData_Hip data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

  // Input carries the BASIS_DIM gradient components; output is a single field per component
  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
      WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
                                                                   r_U);
      GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
      WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                                   BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
      if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
      else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
      WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                        BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }
  }
}

// Same as GradTranspose but accumulates into d_V (SumElementStrided*) instead of overwriting.
extern "C" __launch_bounds__(BASIS_GRAD_BLOCK_SIZE) __global__
    void GradTransposeAdd(const CeedInt num_elem, const CeedScalar *d_interp_1d, const CeedScalar *d_grad_1d, const CeedScalar *__restrict__ d_U,
                          CeedScalar *__restrict__ d_V) {
  extern __shared__ CeedScalar slice[];

  // Stage interp_1d and grad_1d in shared memory; s_G size depends on collocated layout
  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_interp_1d, s_B);
  __shared__ CeedScalar s_G[BASIS_Q_1D * (BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)];
  loadMatrix<BASIS_Q_1D *(BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D)>(d_grad_1d, s_G);
  __syncthreads();

  // Per-thread bookkeeping: flattened thread id and this element's shared slice
  SharedData_Hip data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    if (BASIS_DIM == 1) {
      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, d_U, r_U);
      GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 2) {
      ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, d_U,
                                                                   r_U);
      GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    } else if (BASIS_DIM == 3) {
      ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem,
                                                                   BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, d_U, r_U);
      if (BASIS_HAS_COLLOCATED_GRAD) GradTransposeTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
      else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, s_G, r_V);
      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
    }
  }
}

//------------------------------------------------------------------------------
// Weight kernels by dim
//------------------------------------------------------------------------------

// Write the tensor-product quadrature weights for every element into d_W
// (single component, Q^dim values per element). No basis matrices are needed.
extern "C" __launch_bounds__(BASIS_WEIGHT_BLOCK_SIZE) __global__
    void Weight(const CeedInt num_elem, const CeedScalar *__restrict__ q_weight_1d, CeedScalar *__restrict__ d_W) {
  extern __shared__ CeedScalar slice[];

  // Per-thread bookkeeping: flattened thread id and this element's shared slice
  SharedData_Hip data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

  CeedScalar r_W[BASIS_DIM > 2 ? BASIS_Q_1D : 1];

  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
    if (BASIS_DIM == 1) {
      Weight1d<BASIS_Q_1D>(data, q_weight_1d, r_W);
      WriteElementStrided1d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_W, d_W);
    } else if (BASIS_DIM == 2) {
      WeightTensor2d<BASIS_Q_1D>(data, q_weight_1d, r_W);
      WriteElementStrided2d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D, r_W, d_W);
    } else if (BASIS_DIM == 3) {
      WeightTensor3d<BASIS_Q_1D>(data, q_weight_1d, r_W);
      WriteElementStrided3d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D, r_W,
                                           d_W);
    }
  }
}