1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3 // 4 // SPDX-License-Identifier: BSD-2-Clause 5 // 6 // This file is part of CEED: http://github.com/ceed 7 8 /// @file 9 /// Internal header for SYCL backend macro and type definitions for JiT source 10 #ifndef CEED_SYCL_GEN_TEMPLATES_H 11 #define CEED_SYCL_GEN_TEMPLATES_H 12 13 #include <ceed/types.h> 14 15 #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable 16 #pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable 17 // TODO: Handle FP32 case 18 typedef atomic_double CeedAtomicScalar; 19 20 //------------------------------------------------------------------------------ 21 // Load matrices for basis actions 22 //------------------------------------------------------------------------------ 23 inline void loadMatrix(const CeedInt N, const CeedScalar* restrict d_B, CeedScalar* restrict B) { 24 const CeedInt item_id = get_local_linear_id(); 25 const CeedInt group_size = get_local_size(0) * get_local_size(1) * get_local_size(2); 26 for (CeedInt i = item_id; i < N; i += group_size) B[i] = d_B[i]; 27 } 28 29 //------------------------------------------------------------------------------ 30 // 1D 31 //------------------------------------------------------------------------------ 32 33 //------------------------------------------------------------------------------ 34 // L-vector -> E-vector, offsets provided 35 //------------------------------------------------------------------------------ 36 inline void readDofsOffset1d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem, 37 const global CeedInt* restrict indices, const global CeedScalar* restrict d_u, private CeedScalar* restrict r_u) { 38 const CeedInt item_id_x = get_local_id(0); 39 const CeedInt elem = get_global_id(2); 40 41 if (item_id_x < P_1D && elem < num_elem) { 42 const CeedInt node = item_id_x; 43 const CeedInt ind = indices[node + elem * P_1D]; 44 for (CeedInt comp = 0; comp < num_comp; ++comp) { 45 r_u[comp] = d_u[ind + strides_comp * comp]; 46 } 47 } 48 } 49 50 //------------------------------------------------------------------------------ 51 // L-vector -> E-vector, strided 52 //------------------------------------------------------------------------------ 53 inline void readDofsStrided1d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp, 54 const CeedInt strides_elem, const CeedInt num_elem, global const CeedScalar* restrict d_u, 55 private CeedScalar* restrict r_u) { 56 const CeedInt item_id_x = get_local_id(0); 57 const CeedInt elem = get_global_id(2); 58 59 if (item_id_x < P_1D && elem < num_elem) { 60 const CeedInt node = item_id_x; 61 const CeedInt ind = node * strides_node + elem * strides_elem; 62 for (CeedInt comp = 0; comp < num_comp; comp++) { 63 r_u[comp] = d_u[ind + comp * strides_comp]; 64 } 65 } 66 } 67 68 //------------------------------------------------------------------------------ 69 // E-vector -> L-vector, offsets provided 70 //------------------------------------------------------------------------------ 71 inline void writeDofsOffset1d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem, 72 const global CeedInt* restrict indices, const private CeedScalar* restrict r_v, global CeedAtomicScalar* restrict d_v) { 73 const CeedInt item_id_x = get_local_id(0); 74 const CeedInt elem = get_global_id(2); 75 76 if (item_id_x < P_1D && elem < num_elem) { 77 const CeedInt node = item_id_x; 78 const CeedInt ind = indices[node + elem * P_1D]; 79 for (CeedInt comp = 0; comp < num_comp; ++comp) 80 atomic_fetch_add_explicit(&d_v[ind + strides_comp * comp], r_v[comp], memory_order_relaxed, memory_scope_device); 81 } 82 } 83 84 //------------------------------------------------------------------------------ 85 // E-vector -> L-vector, strided 86 //------------------------------------------------------------------------------ 87 inline void writeDofsStrided1d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp, 88 const CeedInt strides_elem, const CeedInt num_elem, private const CeedScalar* restrict r_v, 89 global CeedScalar* restrict d_v) { 90 const CeedInt item_id_x = get_local_id(0); 91 const CeedInt elem = get_global_id(2); 92 93 if (item_id_x < P_1D && elem < num_elem) { 94 const CeedInt node = item_id_x; 95 const CeedInt ind = node * strides_node + elem * strides_elem; 96 for (CeedInt comp = 0; comp < num_comp; comp++) { 97 d_v[ind + comp * strides_comp] = r_v[comp]; 98 } 99 } 100 } 101 102 //------------------------------------------------------------------------------ 103 // 2D 104 //------------------------------------------------------------------------------ 105 106 //------------------------------------------------------------------------------ 107 // L-vector -> E-vector, offsets provided 108 //------------------------------------------------------------------------------ 109 inline void readDofsOffset2d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem, 110 const global CeedInt* restrict indices, const global CeedScalar* restrict d_u, private CeedScalar* restrict r_u) { 111 const CeedInt item_id_x = get_local_id(0); 112 const CeedInt item_id_y = get_local_id(1); 113 const CeedInt elem = get_global_id(2); 114 115 if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) { 116 const CeedInt node = item_id_x + item_id_y * P_1D; 117 const CeedInt ind = indices[node + elem * P_1D * P_1D]; 118 for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + strides_comp * comp]; 119 } 120 } 121 122 //------------------------------------------------------------------------------ 123 // L-vector -> E-vector, strided 124 //------------------------------------------------------------------------------ 125 inline void readDofsStrided2d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp, 126 const CeedInt strides_elem, const CeedInt num_elem, const global CeedScalar* restrict d_u, 127 private CeedScalar* restrict r_u) { 128 const CeedInt item_id_x = get_local_id(0); 129 const CeedInt item_id_y = get_local_id(1); 130 const CeedInt elem = get_global_id(2); 131 132 if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) { 133 const CeedInt node = item_id_x + item_id_y * P_1D; 134 const CeedInt ind = node * strides_node + elem * strides_elem; 135 for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + comp * strides_comp]; 136 } 137 } 138 139 //------------------------------------------------------------------------------ 140 // E-vector -> L-vector, offsets provided 141 //------------------------------------------------------------------------------ 142 inline void writeDofsOffset2d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem, 143 const global CeedInt* restrict indices, const private CeedScalar* restrict r_v, global CeedAtomicScalar* restrict d_v) { 144 const CeedInt item_id_x = get_local_id(0); 145 const CeedInt item_id_y = get_local_id(1); 146 const CeedInt elem = get_global_id(2); 147 148 if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) { 149 const CeedInt node = item_id_x + item_id_y * P_1D; 150 const CeedInt ind = indices[node + elem * P_1D * P_1D]; 151 for (CeedInt comp = 0; comp < num_comp; ++comp) 152 atomic_fetch_add_explicit(&d_v[ind + strides_comp * comp], r_v[comp], memory_order_relaxed, memory_scope_device); 153 } 154 } 155 156 //------------------------------------------------------------------------------ 157 // E-vector -> L-vector, strided 158 //------------------------------------------------------------------------------ 159 inline void writeDofsStrided2d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp, 160 const CeedInt strides_elem, const CeedInt num_elem, const private CeedScalar* restrict r_v, 161 global CeedScalar* restrict d_v) { 162 const CeedInt item_id_x = get_local_id(0); 163 const CeedInt item_id_y = get_local_id(1); 164 const CeedInt elem = get_global_id(2); 165 166 if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) { 167 const CeedInt node = item_id_x + item_id_y * P_1D; 168 const CeedInt ind = node * strides_node + elem * strides_elem; 169 for (CeedInt comp = 0; comp < num_comp; ++comp) d_v[ind + comp * strides_comp] += r_v[comp]; 170 } 171 } 172 173 //------------------------------------------------------------------------------ 174 // 3D 175 //------------------------------------------------------------------------------ 176 177 //------------------------------------------------------------------------------ 178 // L-vector -> E-vector, offsets provided 179 //------------------------------------------------------------------------------ 180 inline void readDofsOffset3d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem, 181 const global CeedInt* restrict indices, const global CeedScalar* restrict d_u, private CeedScalar* restrict r_u) { 182 const CeedInt item_id_x = get_local_id(0); 183 const CeedInt item_id_y = get_local_id(1); 184 const CeedInt elem = get_global_id(2); 185 186 if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) { 187 for (CeedInt z = 0; z < P_1D; ++z) { 188 const CeedInt node = item_id_x + P_1D * (item_id_y + P_1D * z); 189 const CeedInt ind = indices[node + elem * P_1D * P_1D * P_1D]; 190 for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[z + comp * P_1D] = d_u[ind + strides_comp * comp]; 191 } 192 } 193 } 194 195 //------------------------------------------------------------------------------ 196 // L-vector -> E-vector, strided 197 //------------------------------------------------------------------------------ 198 inline void readDofsStrided3d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp, 199 const CeedInt strides_elem, const CeedInt num_elem, const global CeedScalar* restrict d_u, 200 private CeedScalar* restrict r_u) { 201 const CeedInt item_id_x = get_local_id(0); 202 const CeedInt item_id_y = get_local_id(1); 203 const CeedInt elem = get_global_id(2); 204 205 if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) { 206 for (CeedInt z = 0; z < P_1D; ++z) { 207 const CeedInt node = item_id_x + P_1D * (item_id_y + P_1D * z); 208 const CeedInt ind = node * strides_node + elem * strides_elem; 209 for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[z + comp * P_1D] = d_u[ind + comp * strides_comp]; 210 } 211 } 212 } 213 214 //------------------------------------------------------------------------------ 215 // E-vector -> Q-vector, offests provided 216 //------------------------------------------------------------------------------ 217 inline void readSliceQuadsOffset3d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt Q_1D, const CeedInt num_elem, const CeedInt q, 218 const global CeedInt* restrict indices, const global CeedScalar* restrict d_u, private CeedScalar* restrict r_u) { 219 const CeedInt item_id_x = get_local_id(0); 220 const CeedInt item_id_y = get_local_id(1); 221 const CeedInt elem = get_global_id(2); 222 223 if (item_id_x < Q_1D && item_id_y < Q_1D && elem < num_elem) { 224 const CeedInt node = item_id_x + Q_1D * (item_id_y + Q_1D * q); 225 const CeedInt ind = indices[node + elem * Q_1D * Q_1D * Q_1D]; 226 for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + strides_comp * comp]; 227 } 228 } 229 230 //------------------------------------------------------------------------------ 231 // E-vector -> Q-vector, strided 232 //------------------------------------------------------------------------------ 233 inline void readSliceQuadsStrided3d(const CeedInt num_comp, const CeedInt Q_1D, CeedInt strides_node, CeedInt strides_comp, CeedInt strides_elem, 234 const CeedInt num_elem, const CeedInt q, const global CeedScalar* restrict d_u, 235 private CeedScalar* restrict r_u) { 236 const CeedInt item_id_x = get_local_id(0); 237 const CeedInt item_id_y = get_local_id(1); 238 const CeedInt elem = get_global_id(2); 239 240 if (item_id_x < Q_1D && item_id_y < Q_1D && elem < num_elem) { 241 const CeedInt node = item_id_x + Q_1D * (item_id_y + Q_1D * q); 242 const CeedInt ind = node * strides_node + elem * strides_elem; 243 for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + comp * strides_comp]; 244 } 245 } 246 247 //------------------------------------------------------------------------------ 248 // E-vector -> L-vector, offsets provided 249 //------------------------------------------------------------------------------ 250 inline void writeDofsOffset3d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem, 251 const global CeedInt* restrict indices, const private CeedScalar* restrict r_v, global CeedAtomicScalar* restrict d_v) { 252 const CeedInt item_id_x = get_local_id(0); 253 const CeedInt item_id_y = get_local_id(1); 254 const CeedInt elem = get_global_id(2); 255 256 if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) { 257 for (CeedInt z = 0; z < P_1D; ++z) { 258 const CeedInt node = item_id_x + item_id_y * P_1D + z * P_1D * P_1D; 259 const CeedInt ind = indices[node + elem * P_1D * P_1D * P_1D]; 260 for (CeedInt comp = 0; comp < num_comp; ++comp) 261 atomic_fetch_add_explicit(&d_v[ind + strides_comp * comp], r_v[z + comp * P_1D], memory_order_relaxed, memory_scope_device); 262 } 263 } 264 } 265 266 //------------------------------------------------------------------------------ 267 // E-vector -> L-vector, strided 268 //------------------------------------------------------------------------------ 269 inline void writeDofsStrided3d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp, 270 const CeedInt strides_elem, const CeedInt num_elem, const private CeedScalar* restrict r_v, 271 global CeedScalar* restrict d_v) { 272 const CeedInt item_id_x = get_local_id(0); 273 const CeedInt item_id_y = get_local_id(1); 274 const CeedInt elem = get_global_id(2); 275 276 if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) { 277 for (CeedInt z = 0; z < P_1D; ++z) { 278 const CeedInt node = item_id_x + P_1D * (item_id_y + P_1D * z); 279 const CeedInt ind = node * strides_node + elem * strides_elem; 280 for (CeedInt comp = 0; comp < num_comp; ++comp) d_v[ind + comp * strides_comp] += r_v[z + comp * P_1D]; 281 } 282 } 283 } 284 285 //------------------------------------------------------------------------------ 286 // 3D collocated derivatives computation 287 //------------------------------------------------------------------------------ 288 inline void gradCollo3d(const CeedInt num_comp, const CeedInt Q_1D, const CeedInt q, const private CeedScalar* restrict r_U, 289 const local CeedScalar* s_G, private CeedScalar* restrict r_V, local CeedScalar* restrict scratch) { 290 const CeedInt item_id_x = get_local_id(0); 291 const CeedInt item_id_y = get_local_id(1); 292 293 for (CeedInt comp = 0; comp < num_comp; ++comp) { 294 if (item_id_x < Q_1D && item_id_y < Q_1D) { 295 scratch[item_id_x + item_id_y * T_1D] = r_U[q + comp * Q_1D]; 296 } 297 work_group_barrier(CLK_LOCAL_MEM_FENCE); 298 299 if (item_id_x < Q_1D && item_id_y < Q_1D) { 300 // X derivative 301 r_V[comp + 0 * num_comp] = 0.0; 302 for (CeedInt i = 0; i < Q_1D; ++i) 303 r_V[comp + 0 * num_comp] += s_G[i + item_id_x * Q_1D] * scratch[i + item_id_y * T_1D]; // Contract x direction (X derivative) 304 305 // Y derivative 306 r_V[comp + 1 * num_comp] = 0.0; 307 for (CeedInt i = 0; i < Q_1D; ++i) 308 r_V[comp + 1 * num_comp] += s_G[i + item_id_y * Q_1D] * scratch[item_id_x + i * T_1D]; // Contract y direction (Y derivative) 309 310 // Z derivative 311 r_V[comp + 2 * num_comp] = 0.0; 312 for (CeedInt i = 0; i < Q_1D; ++i) r_V[comp + 2 * num_comp] += s_G[i + q * Q_1D] * r_U[i + comp * Q_1D]; // Contract z direction (Z derivative) 313 } 314 315 work_group_barrier(CLK_LOCAL_MEM_FENCE); 316 } 317 } 318 319 //------------------------------------------------------------------------------ 320 // 3D collocated derivatives transpose 321 //------------------------------------------------------------------------------ 322 inline void gradColloTranspose3d(const CeedInt num_comp, const CeedInt Q_1D, const CeedInt q, const private CeedScalar* restrict r_U, 323 const local CeedScalar* restrict s_G, private CeedScalar* restrict r_V, local CeedScalar* restrict scratch) { 324 const CeedInt item_id_x = get_local_id(0); 325 const CeedInt item_id_y = get_local_id(1); 326 327 for (CeedInt comp = 0; comp < num_comp; ++comp) { 328 // X derivative 329 if (item_id_x < Q_1D && item_id_y < Q_1D) { 330 scratch[item_id_x + item_id_y * T_1D] = r_U[comp + 0 * num_comp]; 331 } 332 work_group_barrier(CLK_LOCAL_MEM_FENCE); 333 334 if (item_id_x < Q_1D && item_id_y < Q_1D) { 335 for (CeedInt i = 0; i < Q_1D; ++i) 336 r_V[q + comp * Q_1D] += s_G[item_id_x + i * Q_1D] * scratch[i + item_id_y * T_1D]; // Contract x direction (X derivative) 337 } 338 work_group_barrier(CLK_LOCAL_MEM_FENCE); 339 340 // Y derivative 341 if (item_id_x < Q_1D && item_id_y < Q_1D) { 342 scratch[item_id_x + item_id_y * T_1D] = r_U[comp + 1 * num_comp]; 343 } 344 work_group_barrier(CLK_LOCAL_MEM_FENCE); 345 346 if (item_id_x < Q_1D && item_id_y < Q_1D) { 347 for (CeedInt i = 0; i < Q_1D; ++i) 348 r_V[q + comp * Q_1D] += s_G[item_id_y + i * Q_1D] * scratch[item_id_x + i * T_1D]; // Contract y direction (Y derivative) 349 } 350 work_group_barrier(CLK_LOCAL_MEM_FENCE); 351 352 // Z derivative 353 if (item_id_x < Q_1D && item_id_y < Q_1D) { 354 for (CeedInt i = 0; i < Q_1D; ++i) 355 r_V[i + comp * Q_1D] += s_G[i + q * Q_1D] * r_U[comp + 2 * num_comp]; // PARTIAL contract z direction (Z derivative) 356 } 357 } 358 } 359 360 //------------------------------------------------------------------------------ 361 362 #endif // CEED_SYCL_GEN_TEMPLATES_H 363