1*9ba83ac0SJeremy L Thompson // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
26ca0f394SUmesh Unnikrishnan // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
36ca0f394SUmesh Unnikrishnan //
46ca0f394SUmesh Unnikrishnan // SPDX-License-Identifier: BSD-2-Clause
56ca0f394SUmesh Unnikrishnan //
66ca0f394SUmesh Unnikrishnan // This file is part of CEED: http://github.com/ceed
76ca0f394SUmesh Unnikrishnan
86ca0f394SUmesh Unnikrishnan /// @file
96ca0f394SUmesh Unnikrishnan /// Internal header for SYCL backend macro and type definitions for JiT source
10c0b5abf0SJeremy L Thompson #include <ceed/types.h>
116ca0f394SUmesh Unnikrishnan
126ca0f394SUmesh Unnikrishnan #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
136ca0f394SUmesh Unnikrishnan #pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
146ca0f394SUmesh Unnikrishnan // TODO: Handle FP32 case
156ca0f394SUmesh Unnikrishnan typedef atomic_double CeedAtomicScalar;
166ca0f394SUmesh Unnikrishnan
176ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
186ca0f394SUmesh Unnikrishnan // Load matrices for basis actions
196ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
loadMatrix(const CeedInt N,const CeedScalar * restrict d_B,CeedScalar * restrict B)206ca0f394SUmesh Unnikrishnan inline void loadMatrix(const CeedInt N, const CeedScalar *restrict d_B, CeedScalar *restrict B) {
216ca0f394SUmesh Unnikrishnan const CeedInt item_id = get_local_linear_id();
226ca0f394SUmesh Unnikrishnan const CeedInt group_size = get_local_size(0) * get_local_size(1) * get_local_size(2);
236ca0f394SUmesh Unnikrishnan for (CeedInt i = item_id; i < N; i += group_size) B[i] = d_B[i];
246ca0f394SUmesh Unnikrishnan }
256ca0f394SUmesh Unnikrishnan
266ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
276ca0f394SUmesh Unnikrishnan // 1D
286ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
296ca0f394SUmesh Unnikrishnan
306ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
316ca0f394SUmesh Unnikrishnan // L-vector -> E-vector, offsets provided
326ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
readDofsOffset1d(const CeedInt num_comp,const CeedInt strides_comp,const CeedInt P_1D,const CeedInt num_elem,const global CeedInt * restrict indices,const global CeedScalar * restrict d_u,private CeedScalar * restrict r_u)336ca0f394SUmesh Unnikrishnan inline void readDofsOffset1d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem,
346ca0f394SUmesh Unnikrishnan const global CeedInt *restrict indices, const global CeedScalar *restrict d_u, private CeedScalar *restrict r_u) {
356ca0f394SUmesh Unnikrishnan const CeedInt item_id_x = get_local_id(0);
366ca0f394SUmesh Unnikrishnan const CeedInt elem = get_global_id(2);
376ca0f394SUmesh Unnikrishnan
386ca0f394SUmesh Unnikrishnan if (item_id_x < P_1D && elem < num_elem) {
396ca0f394SUmesh Unnikrishnan const CeedInt node = item_id_x;
406ca0f394SUmesh Unnikrishnan const CeedInt ind = indices[node + elem * P_1D];
416ca0f394SUmesh Unnikrishnan for (CeedInt comp = 0; comp < num_comp; ++comp) {
426ca0f394SUmesh Unnikrishnan r_u[comp] = d_u[ind + strides_comp * comp];
436ca0f394SUmesh Unnikrishnan }
446ca0f394SUmesh Unnikrishnan }
456ca0f394SUmesh Unnikrishnan }
466ca0f394SUmesh Unnikrishnan
476ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
486ca0f394SUmesh Unnikrishnan // L-vector -> E-vector, strided
496ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
readDofsStrided1d(const CeedInt num_comp,const CeedInt P_1D,const CeedInt strides_node,const CeedInt strides_comp,const CeedInt strides_elem,const CeedInt num_elem,global const CeedScalar * restrict d_u,private CeedScalar * restrict r_u)506ca0f394SUmesh Unnikrishnan inline void readDofsStrided1d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp,
516ca0f394SUmesh Unnikrishnan const CeedInt strides_elem, const CeedInt num_elem, global const CeedScalar *restrict d_u,
526ca0f394SUmesh Unnikrishnan private CeedScalar *restrict r_u) {
536ca0f394SUmesh Unnikrishnan const CeedInt item_id_x = get_local_id(0);
546ca0f394SUmesh Unnikrishnan const CeedInt elem = get_global_id(2);
556ca0f394SUmesh Unnikrishnan
566ca0f394SUmesh Unnikrishnan if (item_id_x < P_1D && elem < num_elem) {
576ca0f394SUmesh Unnikrishnan const CeedInt node = item_id_x;
586ca0f394SUmesh Unnikrishnan const CeedInt ind = node * strides_node + elem * strides_elem;
596ca0f394SUmesh Unnikrishnan for (CeedInt comp = 0; comp < num_comp; comp++) {
606ca0f394SUmesh Unnikrishnan r_u[comp] = d_u[ind + comp * strides_comp];
616ca0f394SUmesh Unnikrishnan }
626ca0f394SUmesh Unnikrishnan }
636ca0f394SUmesh Unnikrishnan }
646ca0f394SUmesh Unnikrishnan
656ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
666ca0f394SUmesh Unnikrishnan // E-vector -> L-vector, offsets provided
676ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
writeDofsOffset1d(const CeedInt num_comp,const CeedInt strides_comp,const CeedInt P_1D,const CeedInt num_elem,const global CeedInt * restrict indices,const private CeedScalar * restrict r_v,global CeedAtomicScalar * restrict d_v)686ca0f394SUmesh Unnikrishnan inline void writeDofsOffset1d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem,
696ca0f394SUmesh Unnikrishnan const global CeedInt *restrict indices, const private CeedScalar *restrict r_v, global CeedAtomicScalar *restrict d_v) {
706ca0f394SUmesh Unnikrishnan const CeedInt item_id_x = get_local_id(0);
716ca0f394SUmesh Unnikrishnan const CeedInt elem = get_global_id(2);
726ca0f394SUmesh Unnikrishnan
736ca0f394SUmesh Unnikrishnan if (item_id_x < P_1D && elem < num_elem) {
746ca0f394SUmesh Unnikrishnan const CeedInt node = item_id_x;
756ca0f394SUmesh Unnikrishnan const CeedInt ind = indices[node + elem * P_1D];
766ca0f394SUmesh Unnikrishnan for (CeedInt comp = 0; comp < num_comp; ++comp)
776ca0f394SUmesh Unnikrishnan atomic_fetch_add_explicit(&d_v[ind + strides_comp * comp], r_v[comp], memory_order_relaxed, memory_scope_device);
786ca0f394SUmesh Unnikrishnan }
796ca0f394SUmesh Unnikrishnan }
806ca0f394SUmesh Unnikrishnan
816ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
826ca0f394SUmesh Unnikrishnan // E-vector -> L-vector, strided
836ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
writeDofsStrided1d(const CeedInt num_comp,const CeedInt P_1D,const CeedInt strides_node,const CeedInt strides_comp,const CeedInt strides_elem,const CeedInt num_elem,private const CeedScalar * restrict r_v,global CeedScalar * restrict d_v)846ca0f394SUmesh Unnikrishnan inline void writeDofsStrided1d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp,
856ca0f394SUmesh Unnikrishnan const CeedInt strides_elem, const CeedInt num_elem, private const CeedScalar *restrict r_v,
866ca0f394SUmesh Unnikrishnan global CeedScalar *restrict d_v) {
876ca0f394SUmesh Unnikrishnan const CeedInt item_id_x = get_local_id(0);
886ca0f394SUmesh Unnikrishnan const CeedInt elem = get_global_id(2);
896ca0f394SUmesh Unnikrishnan
906ca0f394SUmesh Unnikrishnan if (item_id_x < P_1D && elem < num_elem) {
916ca0f394SUmesh Unnikrishnan const CeedInt node = item_id_x;
926ca0f394SUmesh Unnikrishnan const CeedInt ind = node * strides_node + elem * strides_elem;
936ca0f394SUmesh Unnikrishnan for (CeedInt comp = 0; comp < num_comp; comp++) {
946ca0f394SUmesh Unnikrishnan d_v[ind + comp * strides_comp] = r_v[comp];
956ca0f394SUmesh Unnikrishnan }
966ca0f394SUmesh Unnikrishnan }
976ca0f394SUmesh Unnikrishnan }
986ca0f394SUmesh Unnikrishnan
996ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
1006ca0f394SUmesh Unnikrishnan // 2D
1016ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
1026ca0f394SUmesh Unnikrishnan
1036ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
1046ca0f394SUmesh Unnikrishnan // L-vector -> E-vector, offsets provided
1056ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
readDofsOffset2d(const CeedInt num_comp,const CeedInt strides_comp,const CeedInt P_1D,const CeedInt num_elem,const global CeedInt * restrict indices,const global CeedScalar * restrict d_u,private CeedScalar * restrict r_u)1066ca0f394SUmesh Unnikrishnan inline void readDofsOffset2d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem,
1076ca0f394SUmesh Unnikrishnan const global CeedInt *restrict indices, const global CeedScalar *restrict d_u, private CeedScalar *restrict r_u) {
1086ca0f394SUmesh Unnikrishnan const CeedInt item_id_x = get_local_id(0);
1096ca0f394SUmesh Unnikrishnan const CeedInt item_id_y = get_local_id(1);
1106ca0f394SUmesh Unnikrishnan const CeedInt elem = get_global_id(2);
1116ca0f394SUmesh Unnikrishnan
1126ca0f394SUmesh Unnikrishnan if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
1136ca0f394SUmesh Unnikrishnan const CeedInt node = item_id_x + item_id_y * P_1D;
1146ca0f394SUmesh Unnikrishnan const CeedInt ind = indices[node + elem * P_1D * P_1D];
1156ca0f394SUmesh Unnikrishnan for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + strides_comp * comp];
1166ca0f394SUmesh Unnikrishnan }
1176ca0f394SUmesh Unnikrishnan }
1186ca0f394SUmesh Unnikrishnan
1196ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
1206ca0f394SUmesh Unnikrishnan // L-vector -> E-vector, strided
1216ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
readDofsStrided2d(const CeedInt num_comp,const CeedInt P_1D,const CeedInt strides_node,const CeedInt strides_comp,const CeedInt strides_elem,const CeedInt num_elem,const global CeedScalar * restrict d_u,private CeedScalar * restrict r_u)1226ca0f394SUmesh Unnikrishnan inline void readDofsStrided2d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp,
1236ca0f394SUmesh Unnikrishnan const CeedInt strides_elem, const CeedInt num_elem, const global CeedScalar *restrict d_u,
1246ca0f394SUmesh Unnikrishnan private CeedScalar *restrict r_u) {
1256ca0f394SUmesh Unnikrishnan const CeedInt item_id_x = get_local_id(0);
1266ca0f394SUmesh Unnikrishnan const CeedInt item_id_y = get_local_id(1);
1276ca0f394SUmesh Unnikrishnan const CeedInt elem = get_global_id(2);
1286ca0f394SUmesh Unnikrishnan
1296ca0f394SUmesh Unnikrishnan if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
1306ca0f394SUmesh Unnikrishnan const CeedInt node = item_id_x + item_id_y * P_1D;
1316ca0f394SUmesh Unnikrishnan const CeedInt ind = node * strides_node + elem * strides_elem;
1326ca0f394SUmesh Unnikrishnan for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + comp * strides_comp];
1336ca0f394SUmesh Unnikrishnan }
1346ca0f394SUmesh Unnikrishnan }
1356ca0f394SUmesh Unnikrishnan
1366ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
1376ca0f394SUmesh Unnikrishnan // E-vector -> L-vector, offsets provided
1386ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
writeDofsOffset2d(const CeedInt num_comp,const CeedInt strides_comp,const CeedInt P_1D,const CeedInt num_elem,const global CeedInt * restrict indices,const private CeedScalar * restrict r_v,global CeedAtomicScalar * restrict d_v)1396ca0f394SUmesh Unnikrishnan inline void writeDofsOffset2d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem,
1406ca0f394SUmesh Unnikrishnan const global CeedInt *restrict indices, const private CeedScalar *restrict r_v, global CeedAtomicScalar *restrict d_v) {
1416ca0f394SUmesh Unnikrishnan const CeedInt item_id_x = get_local_id(0);
1426ca0f394SUmesh Unnikrishnan const CeedInt item_id_y = get_local_id(1);
1436ca0f394SUmesh Unnikrishnan const CeedInt elem = get_global_id(2);
1446ca0f394SUmesh Unnikrishnan
1456ca0f394SUmesh Unnikrishnan if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
1466ca0f394SUmesh Unnikrishnan const CeedInt node = item_id_x + item_id_y * P_1D;
1476ca0f394SUmesh Unnikrishnan const CeedInt ind = indices[node + elem * P_1D * P_1D];
1486ca0f394SUmesh Unnikrishnan for (CeedInt comp = 0; comp < num_comp; ++comp)
1496ca0f394SUmesh Unnikrishnan atomic_fetch_add_explicit(&d_v[ind + strides_comp * comp], r_v[comp], memory_order_relaxed, memory_scope_device);
1506ca0f394SUmesh Unnikrishnan }
1516ca0f394SUmesh Unnikrishnan }
1526ca0f394SUmesh Unnikrishnan
1536ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
1546ca0f394SUmesh Unnikrishnan // E-vector -> L-vector, strided
1556ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
writeDofsStrided2d(const CeedInt num_comp,const CeedInt P_1D,const CeedInt strides_node,const CeedInt strides_comp,const CeedInt strides_elem,const CeedInt num_elem,const private CeedScalar * restrict r_v,global CeedScalar * restrict d_v)1566ca0f394SUmesh Unnikrishnan inline void writeDofsStrided2d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp,
1576ca0f394SUmesh Unnikrishnan const CeedInt strides_elem, const CeedInt num_elem, const private CeedScalar *restrict r_v,
1586ca0f394SUmesh Unnikrishnan global CeedScalar *restrict d_v) {
1596ca0f394SUmesh Unnikrishnan const CeedInt item_id_x = get_local_id(0);
1606ca0f394SUmesh Unnikrishnan const CeedInt item_id_y = get_local_id(1);
1616ca0f394SUmesh Unnikrishnan const CeedInt elem = get_global_id(2);
1626ca0f394SUmesh Unnikrishnan
1636ca0f394SUmesh Unnikrishnan if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
1646ca0f394SUmesh Unnikrishnan const CeedInt node = item_id_x + item_id_y * P_1D;
1656ca0f394SUmesh Unnikrishnan const CeedInt ind = node * strides_node + elem * strides_elem;
1666ca0f394SUmesh Unnikrishnan for (CeedInt comp = 0; comp < num_comp; ++comp) d_v[ind + comp * strides_comp] += r_v[comp];
1676ca0f394SUmesh Unnikrishnan }
1686ca0f394SUmesh Unnikrishnan }
1696ca0f394SUmesh Unnikrishnan
1706ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
1716ca0f394SUmesh Unnikrishnan // 3D
1726ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
1736ca0f394SUmesh Unnikrishnan
1746ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
1756ca0f394SUmesh Unnikrishnan // L-vector -> E-vector, offsets provided
1766ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
readDofsOffset3d(const CeedInt num_comp,const CeedInt strides_comp,const CeedInt P_1D,const CeedInt num_elem,const global CeedInt * restrict indices,const global CeedScalar * restrict d_u,private CeedScalar * restrict r_u)1776ca0f394SUmesh Unnikrishnan inline void readDofsOffset3d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem,
1786ca0f394SUmesh Unnikrishnan const global CeedInt *restrict indices, const global CeedScalar *restrict d_u, private CeedScalar *restrict r_u) {
1796ca0f394SUmesh Unnikrishnan const CeedInt item_id_x = get_local_id(0);
1806ca0f394SUmesh Unnikrishnan const CeedInt item_id_y = get_local_id(1);
1816ca0f394SUmesh Unnikrishnan const CeedInt elem = get_global_id(2);
1826ca0f394SUmesh Unnikrishnan
1836ca0f394SUmesh Unnikrishnan if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
1846ca0f394SUmesh Unnikrishnan for (CeedInt z = 0; z < P_1D; ++z) {
1856ca0f394SUmesh Unnikrishnan const CeedInt node = item_id_x + P_1D * (item_id_y + P_1D * z);
1866ca0f394SUmesh Unnikrishnan const CeedInt ind = indices[node + elem * P_1D * P_1D * P_1D];
1876ca0f394SUmesh Unnikrishnan for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[z + comp * P_1D] = d_u[ind + strides_comp * comp];
1886ca0f394SUmesh Unnikrishnan }
1896ca0f394SUmesh Unnikrishnan }
1906ca0f394SUmesh Unnikrishnan }
1916ca0f394SUmesh Unnikrishnan
1926ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
1936ca0f394SUmesh Unnikrishnan // L-vector -> E-vector, strided
1946ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
readDofsStrided3d(const CeedInt num_comp,const CeedInt P_1D,const CeedInt strides_node,const CeedInt strides_comp,const CeedInt strides_elem,const CeedInt num_elem,const global CeedScalar * restrict d_u,private CeedScalar * restrict r_u)1956ca0f394SUmesh Unnikrishnan inline void readDofsStrided3d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp,
1966ca0f394SUmesh Unnikrishnan const CeedInt strides_elem, const CeedInt num_elem, const global CeedScalar *restrict d_u,
1976ca0f394SUmesh Unnikrishnan private CeedScalar *restrict r_u) {
1986ca0f394SUmesh Unnikrishnan const CeedInt item_id_x = get_local_id(0);
1996ca0f394SUmesh Unnikrishnan const CeedInt item_id_y = get_local_id(1);
2006ca0f394SUmesh Unnikrishnan const CeedInt elem = get_global_id(2);
2016ca0f394SUmesh Unnikrishnan
2026ca0f394SUmesh Unnikrishnan if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
2036ca0f394SUmesh Unnikrishnan for (CeedInt z = 0; z < P_1D; ++z) {
2046ca0f394SUmesh Unnikrishnan const CeedInt node = item_id_x + P_1D * (item_id_y + P_1D * z);
2056ca0f394SUmesh Unnikrishnan const CeedInt ind = node * strides_node + elem * strides_elem;
2066ca0f394SUmesh Unnikrishnan for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[z + comp * P_1D] = d_u[ind + comp * strides_comp];
2076ca0f394SUmesh Unnikrishnan }
2086ca0f394SUmesh Unnikrishnan }
2096ca0f394SUmesh Unnikrishnan }
2106ca0f394SUmesh Unnikrishnan
2116ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
2126ca0f394SUmesh Unnikrishnan // E-vector -> Q-vector, offests provided
2136ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
readSliceQuadsOffset3d(const CeedInt num_comp,const CeedInt strides_comp,const CeedInt Q_1D,const CeedInt num_elem,const CeedInt q,const global CeedInt * restrict indices,const global CeedScalar * restrict d_u,private CeedScalar * restrict r_u)2146ca0f394SUmesh Unnikrishnan inline void readSliceQuadsOffset3d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt Q_1D, const CeedInt num_elem, const CeedInt q,
2156ca0f394SUmesh Unnikrishnan const global CeedInt *restrict indices, const global CeedScalar *restrict d_u, private CeedScalar *restrict r_u) {
2166ca0f394SUmesh Unnikrishnan const CeedInt item_id_x = get_local_id(0);
2176ca0f394SUmesh Unnikrishnan const CeedInt item_id_y = get_local_id(1);
2186ca0f394SUmesh Unnikrishnan const CeedInt elem = get_global_id(2);
2196ca0f394SUmesh Unnikrishnan
2206ca0f394SUmesh Unnikrishnan if (item_id_x < Q_1D && item_id_y < Q_1D && elem < num_elem) {
2216ca0f394SUmesh Unnikrishnan const CeedInt node = item_id_x + Q_1D * (item_id_y + Q_1D * q);
2226ca0f394SUmesh Unnikrishnan const CeedInt ind = indices[node + elem * Q_1D * Q_1D * Q_1D];
2236ca0f394SUmesh Unnikrishnan for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + strides_comp * comp];
2246ca0f394SUmesh Unnikrishnan }
2256ca0f394SUmesh Unnikrishnan }
2266ca0f394SUmesh Unnikrishnan
2276ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
2286ca0f394SUmesh Unnikrishnan // E-vector -> Q-vector, strided
2296ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
readSliceQuadsStrided3d(const CeedInt num_comp,const CeedInt Q_1D,CeedInt strides_node,CeedInt strides_comp,CeedInt strides_elem,const CeedInt num_elem,const CeedInt q,const global CeedScalar * restrict d_u,private CeedScalar * restrict r_u)2306ca0f394SUmesh Unnikrishnan inline void readSliceQuadsStrided3d(const CeedInt num_comp, const CeedInt Q_1D, CeedInt strides_node, CeedInt strides_comp, CeedInt strides_elem,
2316ca0f394SUmesh Unnikrishnan const CeedInt num_elem, const CeedInt q, const global CeedScalar *restrict d_u,
2326ca0f394SUmesh Unnikrishnan private CeedScalar *restrict r_u) {
2336ca0f394SUmesh Unnikrishnan const CeedInt item_id_x = get_local_id(0);
2346ca0f394SUmesh Unnikrishnan const CeedInt item_id_y = get_local_id(1);
2356ca0f394SUmesh Unnikrishnan const CeedInt elem = get_global_id(2);
2366ca0f394SUmesh Unnikrishnan
2376ca0f394SUmesh Unnikrishnan if (item_id_x < Q_1D && item_id_y < Q_1D && elem < num_elem) {
2386ca0f394SUmesh Unnikrishnan const CeedInt node = item_id_x + Q_1D * (item_id_y + Q_1D * q);
2396ca0f394SUmesh Unnikrishnan const CeedInt ind = node * strides_node + elem * strides_elem;
2406ca0f394SUmesh Unnikrishnan for (CeedInt comp = 0; comp < num_comp; ++comp) r_u[comp] = d_u[ind + comp * strides_comp];
2416ca0f394SUmesh Unnikrishnan }
2426ca0f394SUmesh Unnikrishnan }
2436ca0f394SUmesh Unnikrishnan
2446ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
2456ca0f394SUmesh Unnikrishnan // E-vector -> L-vector, offsets provided
2466ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
writeDofsOffset3d(const CeedInt num_comp,const CeedInt strides_comp,const CeedInt P_1D,const CeedInt num_elem,const global CeedInt * restrict indices,const private CeedScalar * restrict r_v,global CeedAtomicScalar * restrict d_v)2476ca0f394SUmesh Unnikrishnan inline void writeDofsOffset3d(const CeedInt num_comp, const CeedInt strides_comp, const CeedInt P_1D, const CeedInt num_elem,
2486ca0f394SUmesh Unnikrishnan const global CeedInt *restrict indices, const private CeedScalar *restrict r_v, global CeedAtomicScalar *restrict d_v) {
2496ca0f394SUmesh Unnikrishnan const CeedInt item_id_x = get_local_id(0);
2506ca0f394SUmesh Unnikrishnan const CeedInt item_id_y = get_local_id(1);
2516ca0f394SUmesh Unnikrishnan const CeedInt elem = get_global_id(2);
2526ca0f394SUmesh Unnikrishnan
2536ca0f394SUmesh Unnikrishnan if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
2546ca0f394SUmesh Unnikrishnan for (CeedInt z = 0; z < P_1D; ++z) {
2556ca0f394SUmesh Unnikrishnan const CeedInt node = item_id_x + item_id_y * P_1D + z * P_1D * P_1D;
2566ca0f394SUmesh Unnikrishnan const CeedInt ind = indices[node + elem * P_1D * P_1D * P_1D];
2576ca0f394SUmesh Unnikrishnan for (CeedInt comp = 0; comp < num_comp; ++comp)
2586ca0f394SUmesh Unnikrishnan atomic_fetch_add_explicit(&d_v[ind + strides_comp * comp], r_v[z + comp * P_1D], memory_order_relaxed, memory_scope_device);
2596ca0f394SUmesh Unnikrishnan }
2606ca0f394SUmesh Unnikrishnan }
2616ca0f394SUmesh Unnikrishnan }
2626ca0f394SUmesh Unnikrishnan
2636ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
2646ca0f394SUmesh Unnikrishnan // E-vector -> L-vector, strided
2656ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
writeDofsStrided3d(const CeedInt num_comp,const CeedInt P_1D,const CeedInt strides_node,const CeedInt strides_comp,const CeedInt strides_elem,const CeedInt num_elem,const private CeedScalar * restrict r_v,global CeedScalar * restrict d_v)2666ca0f394SUmesh Unnikrishnan inline void writeDofsStrided3d(const CeedInt num_comp, const CeedInt P_1D, const CeedInt strides_node, const CeedInt strides_comp,
2676ca0f394SUmesh Unnikrishnan const CeedInt strides_elem, const CeedInt num_elem, const private CeedScalar *restrict r_v,
2686ca0f394SUmesh Unnikrishnan global CeedScalar *restrict d_v) {
2696ca0f394SUmesh Unnikrishnan const CeedInt item_id_x = get_local_id(0);
2706ca0f394SUmesh Unnikrishnan const CeedInt item_id_y = get_local_id(1);
2716ca0f394SUmesh Unnikrishnan const CeedInt elem = get_global_id(2);
2726ca0f394SUmesh Unnikrishnan
2736ca0f394SUmesh Unnikrishnan if (item_id_x < P_1D && item_id_y < P_1D && elem < num_elem) {
2746ca0f394SUmesh Unnikrishnan for (CeedInt z = 0; z < P_1D; ++z) {
2756ca0f394SUmesh Unnikrishnan const CeedInt node = item_id_x + P_1D * (item_id_y + P_1D * z);
2766ca0f394SUmesh Unnikrishnan const CeedInt ind = node * strides_node + elem * strides_elem;
2776ca0f394SUmesh Unnikrishnan for (CeedInt comp = 0; comp < num_comp; ++comp) d_v[ind + comp * strides_comp] += r_v[z + comp * P_1D];
2786ca0f394SUmesh Unnikrishnan }
2796ca0f394SUmesh Unnikrishnan }
2806ca0f394SUmesh Unnikrishnan }
2816ca0f394SUmesh Unnikrishnan
2826ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
2836ca0f394SUmesh Unnikrishnan // 3D collocated derivatives computation
2846ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
gradCollo3d(const CeedInt num_comp,const CeedInt Q_1D,const CeedInt q,const private CeedScalar * restrict r_U,const local CeedScalar * s_G,private CeedScalar * restrict r_V,local CeedScalar * restrict scratch)2856ca0f394SUmesh Unnikrishnan inline void gradCollo3d(const CeedInt num_comp, const CeedInt Q_1D, const CeedInt q, const private CeedScalar *restrict r_U,
2866ca0f394SUmesh Unnikrishnan const local CeedScalar *s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
2876ca0f394SUmesh Unnikrishnan const CeedInt item_id_x = get_local_id(0);
2886ca0f394SUmesh Unnikrishnan const CeedInt item_id_y = get_local_id(1);
2896ca0f394SUmesh Unnikrishnan
2906ca0f394SUmesh Unnikrishnan for (CeedInt comp = 0; comp < num_comp; ++comp) {
2916ca0f394SUmesh Unnikrishnan if (item_id_x < Q_1D && item_id_y < Q_1D) {
2926ca0f394SUmesh Unnikrishnan scratch[item_id_x + item_id_y * T_1D] = r_U[q + comp * Q_1D];
2936ca0f394SUmesh Unnikrishnan }
2946ca0f394SUmesh Unnikrishnan work_group_barrier(CLK_LOCAL_MEM_FENCE);
2956ca0f394SUmesh Unnikrishnan
2966ca0f394SUmesh Unnikrishnan if (item_id_x < Q_1D && item_id_y < Q_1D) {
2976ca0f394SUmesh Unnikrishnan // X derivative
2986ca0f394SUmesh Unnikrishnan r_V[comp + 0 * num_comp] = 0.0;
2996ca0f394SUmesh Unnikrishnan for (CeedInt i = 0; i < Q_1D; ++i)
3006ca0f394SUmesh Unnikrishnan r_V[comp + 0 * num_comp] += s_G[i + item_id_x * Q_1D] * scratch[i + item_id_y * T_1D]; // Contract x direction (X derivative)
3016ca0f394SUmesh Unnikrishnan
3026ca0f394SUmesh Unnikrishnan // Y derivative
3036ca0f394SUmesh Unnikrishnan r_V[comp + 1 * num_comp] = 0.0;
3046ca0f394SUmesh Unnikrishnan for (CeedInt i = 0; i < Q_1D; ++i)
3056ca0f394SUmesh Unnikrishnan r_V[comp + 1 * num_comp] += s_G[i + item_id_y * Q_1D] * scratch[item_id_x + i * T_1D]; // Contract y direction (Y derivative)
3066ca0f394SUmesh Unnikrishnan
3076ca0f394SUmesh Unnikrishnan // Z derivative
3086ca0f394SUmesh Unnikrishnan r_V[comp + 2 * num_comp] = 0.0;
3096ca0f394SUmesh Unnikrishnan for (CeedInt i = 0; i < Q_1D; ++i) r_V[comp + 2 * num_comp] += s_G[i + q * Q_1D] * r_U[i + comp * Q_1D]; // Contract z direction (Z derivative)
3106ca0f394SUmesh Unnikrishnan }
3116ca0f394SUmesh Unnikrishnan
3126ca0f394SUmesh Unnikrishnan work_group_barrier(CLK_LOCAL_MEM_FENCE);
3136ca0f394SUmesh Unnikrishnan }
3146ca0f394SUmesh Unnikrishnan }
3156ca0f394SUmesh Unnikrishnan
3166ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
3176ca0f394SUmesh Unnikrishnan // 3D collocated derivatives transpose
3186ca0f394SUmesh Unnikrishnan //------------------------------------------------------------------------------
gradColloTranspose3d(const CeedInt num_comp,const CeedInt Q_1D,const CeedInt q,const private CeedScalar * restrict r_U,const local CeedScalar * restrict s_G,private CeedScalar * restrict r_V,local CeedScalar * restrict scratch)3196ca0f394SUmesh Unnikrishnan inline void gradColloTranspose3d(const CeedInt num_comp, const CeedInt Q_1D, const CeedInt q, const private CeedScalar *restrict r_U,
3206ca0f394SUmesh Unnikrishnan const local CeedScalar *restrict s_G, private CeedScalar *restrict r_V, local CeedScalar *restrict scratch) {
3216ca0f394SUmesh Unnikrishnan const CeedInt item_id_x = get_local_id(0);
3226ca0f394SUmesh Unnikrishnan const CeedInt item_id_y = get_local_id(1);
3236ca0f394SUmesh Unnikrishnan
3246ca0f394SUmesh Unnikrishnan for (CeedInt comp = 0; comp < num_comp; ++comp) {
3256ca0f394SUmesh Unnikrishnan // X derivative
3266ca0f394SUmesh Unnikrishnan if (item_id_x < Q_1D && item_id_y < Q_1D) {
3276ca0f394SUmesh Unnikrishnan scratch[item_id_x + item_id_y * T_1D] = r_U[comp + 0 * num_comp];
3286ca0f394SUmesh Unnikrishnan }
3296ca0f394SUmesh Unnikrishnan work_group_barrier(CLK_LOCAL_MEM_FENCE);
3306ca0f394SUmesh Unnikrishnan
3316ca0f394SUmesh Unnikrishnan if (item_id_x < Q_1D && item_id_y < Q_1D) {
3326ca0f394SUmesh Unnikrishnan for (CeedInt i = 0; i < Q_1D; ++i)
3336ca0f394SUmesh Unnikrishnan r_V[q + comp * Q_1D] += s_G[item_id_x + i * Q_1D] * scratch[i + item_id_y * T_1D]; // Contract x direction (X derivative)
3346ca0f394SUmesh Unnikrishnan }
3356ca0f394SUmesh Unnikrishnan work_group_barrier(CLK_LOCAL_MEM_FENCE);
3366ca0f394SUmesh Unnikrishnan
3376ca0f394SUmesh Unnikrishnan // Y derivative
3386ca0f394SUmesh Unnikrishnan if (item_id_x < Q_1D && item_id_y < Q_1D) {
3396ca0f394SUmesh Unnikrishnan scratch[item_id_x + item_id_y * T_1D] = r_U[comp + 1 * num_comp];
3406ca0f394SUmesh Unnikrishnan }
3416ca0f394SUmesh Unnikrishnan work_group_barrier(CLK_LOCAL_MEM_FENCE);
3426ca0f394SUmesh Unnikrishnan
3436ca0f394SUmesh Unnikrishnan if (item_id_x < Q_1D && item_id_y < Q_1D) {
3446ca0f394SUmesh Unnikrishnan for (CeedInt i = 0; i < Q_1D; ++i)
3456ca0f394SUmesh Unnikrishnan r_V[q + comp * Q_1D] += s_G[item_id_y + i * Q_1D] * scratch[item_id_x + i * T_1D]; // Contract y direction (Y derivative)
3466ca0f394SUmesh Unnikrishnan }
3476ca0f394SUmesh Unnikrishnan work_group_barrier(CLK_LOCAL_MEM_FENCE);
3486ca0f394SUmesh Unnikrishnan
3496ca0f394SUmesh Unnikrishnan // Z derivative
3506ca0f394SUmesh Unnikrishnan if (item_id_x < Q_1D && item_id_y < Q_1D) {
3516ca0f394SUmesh Unnikrishnan for (CeedInt i = 0; i < Q_1D; ++i)
3526ca0f394SUmesh Unnikrishnan r_V[i + comp * Q_1D] += s_G[i + q * Q_1D] * r_U[comp + 2 * num_comp]; // PARTIAL contract z direction (Z derivative)
3536ca0f394SUmesh Unnikrishnan }
3546ca0f394SUmesh Unnikrishnan }
3556ca0f394SUmesh Unnikrishnan }
356