1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 /// @file 9 /// Internal header for SYCL backend macro and type definitions for JiT source 10 #include <ceed/types.h> 11 12 #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable 13 #pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable 14 // TODO: Handle FP32 case 15 typedef atomic_double CeedAtomicScalar; 16 17 //------------------------------------------------------------------------------ 18 // Load matrices for basis actions 19 //------------------------------------------------------------------------------ 20 inline void loadMatrix(const CeedInt N, const CeedScalar *restrict d_B, CeedScalar *restrict B) { 21 const CeedInt item_id = get_local_linear_id(); 22 const CeedInt group_size = get_local_size(0) * get_local_size(1) * get_local_size(2); 23 for (CeedInt i = item_id; i < N; i += group_size) B[i] = d_B[i]; 24 } 25 26 //------------------------------------------------------------------------------ 27 // 1D 28 //------------------------------------------------------------------------------ 29 30 //------------------------------------------------------------------------------ 31 // L-vector -> E-vector, offsets provided 32 //------------------------------------------------------------------------------ 33 inline void readDofsOffset1d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem, 34 const global CeedInt *restrict indices, const global CeedScalar *restrict d_u, private CeedScalar *restrict r_u) { 35 const CeedInt item_id_x = get_local_id(0); 36 const CeedInt elem = get_global_id(2); 37 38 if (item_id_x < P_1D && elem < num_elem) { 39 const CeedInt node = item_id_x; 40 const CeedInt ind = indices[node + elem * P_1D]; 41 for (CeedInt comp = 0; comp < num_comp; ++comp) { 42 r_u[comp] = d_u[ind + strides_comp * comp]; 43 } 44 } 45 } 46 47 //------------------------------------------------------------------------------ 48 // L-vector -> E-vector, strided 49 //------------------------------------------------------------------------------ 50 inline void readDofsStrided1d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp, 51 const CeedInt strides_elem, const CeedInt num_elem, global const CeedScalar *restrict d_u, 52 private CeedScalar *restrict r_u) { 53 const CeedInt item_id_x = get_local_id(0); 54 const CeedInt elem = get_global_id(2); 55 56 if (item_id_x < P_1D && elem < num_elem) { 57 const CeedInt node = item_id_x; 58 const CeedInt ind = node * strides_node + elem * strides_elem; 59 for (CeedInt comp = 0; comp < num_comp; comp++) { 60 r_u[comp] = d_u[ind + comp * strides_comp]; 61 } 62 } 63 } 64 65 //------------------------------------------------------------------------------ 66 // E-vector -> L-vector, offsets provided 67 //------------------------------------------------------------------------------ 68 inline void writeDofsOffset1d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem, 69 const global CeedInt *restrict indices, const private CeedScalar *restrict r_v, global CeedAtomicScalar *restrict d_v) { 70 const CeedInt item_id_x = get_local_id(0); 71 const CeedInt elem = get_global_id(2); 72 73 if (item_id_x < P_1D && elem < num_elem) { 74 const CeedInt node = item_id_x; 75 const CeedInt ind = indices[node + elem * P_1D]; 76 for (CeedInt comp = 0; comp < num_comp; ++comp) 77 atomic_fetch_add_explicit(&d_v[ind + strides_comp * comp], r_v[comp], memory_order_relaxed, memory_scope_device); 78 } 79 } 80 81 //------------------------------------------------------------------------------ 82 // E-vector -> L-vector, strided 83 //------------------------------------------------------------------------------ 84 inline void writeDofsStrided1d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp, 85 const CeedInt strides_elem, const CeedInt num_elem, private const CeedScalar *restrict r_v, 86 global CeedScalar *restrict d_v) { 87 const CeedInt item_id_x = get_local_id(0); 88 const CeedInt elem = get_global_id(2); 89 90 if (item_id_x < P_1D && elem < num_elem) { 91 const CeedInt node = item_id_x; 92 const CeedInt ind = node * strides_node + elem * strides_elem; 93 for (CeedInt comp = 0; comp < num_comp; comp++) { 94 d_v[ind + comp * strides_comp] = r_v[comp]; 95 } 96 } 97 } 98 99 //------------------------------------------------------------------------------ 100 // 2D 101 //------------------------------------------------------------------------------ 102 103 //------------------------------------------------------------------------------ 104 // L-vector -> E-vector, offsets provided 105 //------------------------------------------------------------------------------ 106 inline void readDofsOffset2d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem, 107 const global CeedInt *restrict indices, const global CeedScalar *restrict d_u, private CeedScalar *restrict r_u) { 108 const CeedInt item_id_x = get_local_id(0); 109 const CeedInt item_id_y = get_local_id(1); 110 const CeedInt elem = get_global_id(2); 111 112 if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) { 113 const CeedInt node = item_id_x + item_id_y * P_1D; 114 const CeedInt ind = indices[node + elem * P_1D * P_1D]; 115 for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + strides_comp * comp]; 116 } 117 } 118 119 //------------------------------------------------------------------------------ 120 // L-vector -> E-vector, strided 121 //------------------------------------------------------------------------------ 122 inline void readDofsStrided2d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp, 123 const CeedInt strides_elem, const CeedInt num_elem, const global CeedScalar *restrict d_u, 124 private CeedScalar *restrict r_u) { 125 const CeedInt item_id_x = get_local_id(0); 126 const CeedInt item_id_y = get_local_id(1); 127 const CeedInt elem = get_global_id(2); 128 129 if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) { 130 const CeedInt node = item_id_x + item_id_y * P_1D; 131 const CeedInt ind = node * strides_node + elem * strides_elem; 132 for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + comp * strides_comp]; 133 } 134 } 135 136 //------------------------------------------------------------------------------ 137 // E-vector -> L-vector, offsets provided 138 //------------------------------------------------------------------------------ 139 inline void writeDofsOffset2d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem, 140 const global CeedInt *restrict indices, const private CeedScalar *restrict r_v, global CeedAtomicScalar *restrict d_v) { 141 const CeedInt item_id_x = get_local_id(0); 142 const CeedInt item_id_y = get_local_id(1); 143 const CeedInt elem = get_global_id(2); 144 145 if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) { 146 const CeedInt node = item_id_x + item_id_y * P_1D; 147 const CeedInt ind = indices[node + elem * P_1D * P_1D]; 148 for (CeedInt comp = 0; comp < num_comp; ++comp) 149 atomic_fetch_add_explicit(&d_v[ind + strides_comp * comp], r_v[comp], memory_order_relaxed, memory_scope_device); 150 } 151 } 152 153 //------------------------------------------------------------------------------ 154 // E-vector -> L-vector, strided 155 //------------------------------------------------------------------------------ 156 inline void writeDofsStrided2d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp, 157 const CeedInt strides_elem, const CeedInt num_elem, const private CeedScalar *restrict r_v, 158 global CeedScalar *restrict d_v) { 159 const CeedInt item_id_x = get_local_id(0); 160 const CeedInt item_id_y = get_local_id(1); 161 const CeedInt elem = get_global_id(2); 162 163 if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) { 164 const CeedInt node = item_id_x + item_id_y * P_1D; 165 const CeedInt ind = node * strides_node + elem * strides_elem; 166 for (CeedInt comp = 0; comp < num_comp; ++comp) d_v[ind + comp * strides_comp] += r_v[comp]; 167 } 168 } 169 170 //------------------------------------------------------------------------------ 171 // 3D 172 //------------------------------------------------------------------------------ 173 174 //------------------------------------------------------------------------------ 175 // L-vector -> E-vector, offsets provided 176 //------------------------------------------------------------------------------ 177 inline void readDofsOffset3d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem, 178 const global CeedInt *restrict indices, const global CeedScalar *restrict d_u, private CeedScalar *restrict r_u) { 179 const CeedInt item_id_x = get_local_id(0); 180 const CeedInt item_id_y = get_local_id(1); 181 const CeedInt elem = get_global_id(2); 182 183 if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) { 184 for (CeedInt z = 0; z < P_1D; ++z) { 185 const CeedInt node = item_id_x + P_1D * (item_id_y + P_1D * z); 186 const CeedInt ind = indices[node + elem * P_1D * P_1D * P_1D]; 187 for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[z + comp * P_1D] = d_u[ind + strides_comp * comp]; 188 } 189 } 190 } 191 192 //------------------------------------------------------------------------------ 193 // L-vector -> E-vector, strided 194 //------------------------------------------------------------------------------ 195 inline void readDofsStrided3d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp, 196 const CeedInt strides_elem, const CeedInt num_elem, const global CeedScalar *restrict d_u, 197 private CeedScalar *restrict r_u) { 198 const CeedInt item_id_x = get_local_id(0); 199 const CeedInt item_id_y = get_local_id(1); 200 const CeedInt elem = get_global_id(2); 201 202 if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) { 203 for (CeedInt z = 0; z < P_1D; ++z) { 204 const CeedInt node = item_id_x + P_1D * (item_id_y + P_1D * z); 205 const CeedInt ind = node * strides_node + elem * strides_elem; 206 for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[z + comp * P_1D] = d_u[ind + comp * strides_comp]; 207 } 208 } 209 } 210 211 //------------------------------------------------------------------------------ 212 // E-vector -> Q-vector, offests provided 213 //------------------------------------------------------------------------------ 214 inline void readSliceQuadsOffset3d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt Q_1D, const CeedInt num_elem, const CeedInt q, 215 const global CeedInt *restrict indices, const global CeedScalar *restrict d_u, private CeedScalar *restrict r_u) { 216 const CeedInt item_id_x = get_local_id(0); 217 const CeedInt item_id_y = get_local_id(1); 218 const CeedInt elem = get_global_id(2); 219 220 if (item_id_x < Q_1D && item_id_y < Q_1D && elem < num_elem) { 221 const CeedInt node = item_id_x + Q_1D * (item_id_y + Q_1D * q); 222 const CeedInt ind = indices[node + elem * Q_1D * Q_1D * Q_1D]; 223 for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + strides_comp * comp]; 224 } 225 } 226 227 //------------------------------------------------------------------------------ 228 // E-vector -> Q-vector, strided 229 //------------------------------------------------------------------------------ 230 inline void readSliceQuadsStrided3d(const CeedInt num_comp, const CeedInt Q_1D, CeedInt strides_node, CeedInt strides_comp, CeedInt strides_elem, 231 const CeedInt num_elem, const CeedInt q, const global CeedScalar *restrict d_u, 232 private CeedScalar *restrict r_u) { 233 const CeedInt item_id_x = get_local_id(0); 234 const CeedInt item_id_y = get_local_id(1); 235 const CeedInt elem = get_global_id(2); 236 237 if (item_id_x < Q_1D && item_id_y < Q_1D && elem < num_elem) { 238 const CeedInt node = item_id_x + Q_1D * (item_id_y + Q_1D * q); 239 const CeedInt ind = node * strides_node + elem * strides_elem; 240 for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + comp * strides_comp]; 241 } 242 } 243 244 //------------------------------------------------------------------------------ 245 // E-vector -> L-vector, offsets provided 246 //------------------------------------------------------------------------------ 247 inline void writeDofsOffset3d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem, 248 const global CeedInt *restrict indices, const private CeedScalar *restrict r_v, global CeedAtomicScalar *restrict d_v) { 249 const CeedInt item_id_x = get_local_id(0); 250 const CeedInt item_id_y = get_local_id(1); 251 const CeedInt elem = get_global_id(2); 252 253 if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) { 254 for (CeedInt z = 0; z < P_1D; ++z) { 255 const CeedInt node = item_id_x + item_id_y * P_1D + z * P_1D * P_1D; 256 const CeedInt ind = indices[node + elem * P_1D * P_1D * P_1D]; 257 for (CeedInt comp = 0; comp < num_comp; ++comp) 258 atomic_fetch_add_explicit(&d_v[ind + strides_comp * comp], r_v[z + comp * P_1D], memory_order_relaxed, memory_scope_device); 259 } 260 } 261 } 262 263 //------------------------------------------------------------------------------ 264 // E-vector -> L-vector, strided 265 //------------------------------------------------------------------------------ 266 inline void writeDofsStrided3d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp, 267 const CeedInt strides_elem, const CeedInt num_elem, const private CeedScalar *restrict r_v, 268 global CeedScalar *restrict d_v) { 269 const CeedInt item_id_x = get_local_id(0); 270 const CeedInt item_id_y = get_local_id(1); 271 const CeedInt elem = get_global_id(2); 272 273 if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) { 274 for (CeedInt z = 0; z < P_1D; ++z) { 275 const CeedInt node = item_id_x + P_1D * (item_id_y + P_1D * z); 276 const CeedInt ind = node * strides_node + elem * strides_elem; 277 for (CeedInt comp = 0; comp < num_comp; ++comp) d_v[ind + comp * strides_comp] += r_v[z + comp * P_1D]; 278 } 279 } 280 } 281 282 //------------------------------------------------------------------------------ 283 // 3D collocated derivatives computation 284 //------------------------------------------------------------------------------ 285 inline void gradCollo3d(const CeedInt num_comp, const CeedInt Q_1D, const CeedInt q, const private CeedScalar *restrict r_U, 286 const local CeedScalar *s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 287 const CeedInt item_id_x = get_local_id(0); 288 const CeedInt item_id_y = get_local_id(1); 289 290 for (CeedInt comp = 0; comp < num_comp; ++comp) { 291 if (item_id_x < Q_1D && item_id_y < Q_1D) { 292 scratch[item_id_x + item_id_y * T_1D] = r_U[q + comp * Q_1D]; 293 } 294 work_group_barrier(CLK_LOCAL_MEM_FENCE); 295 296 if (item_id_x < Q_1D && item_id_y < Q_1D) { 297 // X derivative 298 r_V[comp + 0 * num_comp] = 0.0; 299 for (CeedInt i = 0; i < Q_1D; ++i) 300 r_V[comp + 0 * num_comp] += s_G[i + item_id_x * Q_1D] * scratch[i + item_id_y * T_1D]; // Contract x direction (X derivative) 301 302 // Y derivative 303 r_V[comp + 1 * num_comp] = 0.0; 304 for (CeedInt i = 0; i < Q_1D; ++i) 305 r_V[comp + 1 * num_comp] += s_G[i + item_id_y * Q_1D] * scratch[item_id_x + i * T_1D]; // Contract y direction (Y derivative) 306 307 // Z derivative 308 r_V[comp + 2 * num_comp] = 0.0; 309 for (CeedInt i = 0; i < Q_1D; ++i) r_V[comp + 2 * num_comp] += s_G[i + q * Q_1D] * r_U[i + comp * Q_1D]; // Contract z direction (Z derivative) 310 } 311 312 work_group_barrier(CLK_LOCAL_MEM_FENCE); 313 } 314 } 315 316 //------------------------------------------------------------------------------ 317 // 3D collocated derivatives transpose 318 //------------------------------------------------------------------------------ 319 inline void gradColloTranspose3d(const CeedInt num_comp, const CeedInt Q_1D, const CeedInt q, const private CeedScalar *restrict r_U, 320 const local CeedScalar *restrict s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) { 321 const CeedInt item_id_x = get_local_id(0); 322 const CeedInt item_id_y = get_local_id(1); 323 324 for (CeedInt comp = 0; comp < num_comp; ++comp) { 325 // X derivative 326 if (item_id_x < Q_1D && item_id_y < Q_1D) { 327 scratch[item_id_x + item_id_y * T_1D] = r_U[comp + 0 * num_comp]; 328 } 329 work_group_barrier(CLK_LOCAL_MEM_FENCE); 330 331 if (item_id_x < Q_1D && item_id_y < Q_1D) { 332 for (CeedInt i = 0; i < Q_1D; ++i) 333 r_V[q + comp * Q_1D] += s_G[item_id_x + i * Q_1D] * scratch[i + item_id_y * T_1D]; // Contract x direction (X derivative) 334 } 335 work_group_barrier(CLK_LOCAL_MEM_FENCE); 336 337 // Y derivative 338 if (item_id_x < Q_1D && item_id_y < Q_1D) { 339 scratch[item_id_x + item_id_y * T_1D] = r_U[comp + 1 * num_comp]; 340 } 341 work_group_barrier(CLK_LOCAL_MEM_FENCE); 342 343 if (item_id_x < Q_1D && item_id_y < Q_1D) { 344 for (CeedInt i = 0; i < Q_1D; ++i) 345 r_V[q + comp * Q_1D] += s_G[item_id_y + i * Q_1D] * scratch[item_id_x + i * T_1D]; // Contract y direction (Y derivative) 346 } 347 work_group_barrier(CLK_LOCAL_MEM_FENCE); 348 349 // Z derivative 350 if (item_id_x < Q_1D && item_id_y < Q_1D) { 351 for (CeedInt i = 0; i < Q_1D; ++i) 352 r_V[i + comp * Q_1D] += s_G[i + q * Q_1D] * r_U[comp + 2 * num_comp]; // PARTIAL contract z direction (Z derivative) 353 } 354 } 355 } 356