19e82028bSJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors. 29e82028bSJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 39e82028bSJeremy L Thompson // 49e82028bSJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause 59e82028bSJeremy L Thompson // 69e82028bSJeremy L Thompson // This file is part of CEED: http://github.com/ceed 79e82028bSJeremy L Thompson 89e82028bSJeremy L Thompson #include <ceed.h> 99e82028bSJeremy L Thompson #include <ceed/backend.h> 109e82028bSJeremy L Thompson #include <stdbool.h> 119e82028bSJeremy L Thompson #include <stdlib.h> 129e82028bSJeremy L Thompson #include <string.h> 139e82028bSJeremy L Thompson 149e82028bSJeremy L Thompson #include "ceed-memcheck.h" 159e82028bSJeremy L Thompson 169e82028bSJeremy L Thompson //------------------------------------------------------------------------------ 179e82028bSJeremy L Thompson // Set backend strides 189e82028bSJeremy L Thompson //------------------------------------------------------------------------------ 199e82028bSJeremy L Thompson static inline int CeedElemRestrictionGetBackendStrides_Memcheck(CeedElemRestriction rstr, CeedInt strides[3]) { 209e82028bSJeremy L Thompson CeedInt elem_size, num_comp, num_elem; 219e82028bSJeremy L Thompson 229e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); 239e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); 249e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); 259e82028bSJeremy L Thompson // Memcheck default, contiguous by component, then node 269e82028bSJeremy L Thompson strides[0] = num_comp; 279e82028bSJeremy L Thompson strides[1] = 1; 289e82028bSJeremy L Thompson strides[2] = num_comp * elem_size; 299e82028bSJeremy L Thompson /** 309e82028bSJeremy L Thompson // CPU default, contiguous by node, then component 319e82028bSJeremy L Thompson strides[0] = 1; 329e82028bSJeremy L Thompson strides[1] = elem_size; 339e82028bSJeremy L Thompson strides[2] = elem_size * num_comp; 349e82028bSJeremy L Thompson 359e82028bSJeremy L Thompson // GPU default, contiguous by node, then element 369e82028bSJeremy L Thompson strides[0] = 1; 379e82028bSJeremy L Thompson strides[1] = num_elem * elem_size; 389e82028bSJeremy L Thompson strides[2] = elem_size; 399e82028bSJeremy L Thompson **/ 409e82028bSJeremy L Thompson return CEED_ERROR_SUCCESS; 419e82028bSJeremy L Thompson } 429e82028bSJeremy L Thompson 439e82028bSJeremy L Thompson //------------------------------------------------------------------------------ 449e82028bSJeremy L Thompson // Core ElemRestriction Apply Code 459e82028bSJeremy L Thompson //------------------------------------------------------------------------------ 469e82028bSJeremy L Thompson static inline int CeedElemRestrictionApplyStridedNoTranspose_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, 479e82028bSJeremy L Thompson CeedInt start, CeedInt stop, CeedInt num_elem, CeedInt elem_size, 48740363ccSJeremy L Thompson CeedSize v_offset, const CeedScalar *__restrict__ uu, 499e82028bSJeremy L Thompson CeedScalar *__restrict__ vv) { 509e82028bSJeremy L Thompson // Get strides 519e82028bSJeremy L Thompson bool has_backend_strides; 529e82028bSJeremy L Thompson CeedInt strides[3] = {0}; 539e82028bSJeremy L Thompson 549e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides)); 559e82028bSJeremy L Thompson if (has_backend_strides) CeedCallBackend(CeedElemRestrictionGetBackendStrides_Memcheck(rstr, strides)); 5656c48462SJeremy L Thompson else CeedCallBackend(CeedElemRestrictionGetStrides(rstr, strides)); 579e82028bSJeremy L Thompson 589e82028bSJeremy L Thompson // Apply restriction 59*33e3c889SJeremy L Thompson for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { 60*33e3c889SJeremy L Thompson CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) { 61*33e3c889SJeremy L Thompson CeedPragmaSIMD for (CeedSize n = 0; n < elem_size; n++) { 62*33e3c889SJeremy L Thompson CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) { 63*33e3c889SJeremy L Thompson vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = 64*33e3c889SJeremy L Thompson uu[n * strides[0] + k * strides[1] + CeedIntMin(e + j, num_elem - 1) * (CeedSize)strides[2]]; 659e82028bSJeremy L Thompson } 669e82028bSJeremy L Thompson } 679e82028bSJeremy L Thompson } 689e82028bSJeremy L Thompson } 699e82028bSJeremy L Thompson return CEED_ERROR_SUCCESS; 709e82028bSJeremy L Thompson } 719e82028bSJeremy L Thompson 729e82028bSJeremy L Thompson static inline int CeedElemRestrictionApplyOffsetNoTranspose_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, 739e82028bSJeremy L Thompson const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem, 74740363ccSJeremy L Thompson CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu, 759e82028bSJeremy L Thompson CeedScalar *__restrict__ vv) { 769e82028bSJeremy L Thompson // Default restriction with offsets 779e82028bSJeremy L Thompson CeedElemRestriction_Memcheck *impl; 789e82028bSJeremy L Thompson 799e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 80*33e3c889SJeremy L Thompson for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { 81*33e3c889SJeremy L Thompson CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) { 82*33e3c889SJeremy L Thompson CeedPragmaSIMD for (CeedSize i = 0; i < elem_size * block_size; i++) { 83*33e3c889SJeremy L Thompson vv[elem_size * (k * block_size + e * num_comp) + i - v_offset] = uu[impl->offsets[i + e * elem_size] + k * comp_stride]; 849e82028bSJeremy L Thompson } 859e82028bSJeremy L Thompson } 869e82028bSJeremy L Thompson } 879e82028bSJeremy L Thompson return CEED_ERROR_SUCCESS; 889e82028bSJeremy L Thompson } 899e82028bSJeremy L Thompson 909e82028bSJeremy L Thompson static inline int CeedElemRestrictionApplyOrientedNoTranspose_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp, 919e82028bSJeremy L Thompson const CeedInt block_size, const CeedInt comp_stride, CeedInt start, 92740363ccSJeremy L Thompson CeedInt stop, CeedInt num_elem, CeedInt elem_size, CeedSize v_offset, 939e82028bSJeremy L Thompson const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) { 949e82028bSJeremy L Thompson // Restriction with orientations 959e82028bSJeremy L Thompson CeedElemRestriction_Memcheck *impl; 969e82028bSJeremy L Thompson 979e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 98*33e3c889SJeremy L Thompson for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { 99*33e3c889SJeremy L Thompson CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) { 100*33e3c889SJeremy L Thompson CeedPragmaSIMD for (CeedSize i = 0; i < elem_size * block_size; i++) { 101*33e3c889SJeremy L Thompson vv[elem_size * (k * block_size + e * num_comp) + i - v_offset] = 1029e82028bSJeremy L Thompson uu[impl->offsets[i + e * elem_size] + k * comp_stride] * (impl->orients[i + e * elem_size] ? -1.0 : 1.0); 1039e82028bSJeremy L Thompson } 1049e82028bSJeremy L Thompson } 1059e82028bSJeremy L Thompson } 1069e82028bSJeremy L Thompson return CEED_ERROR_SUCCESS; 1079e82028bSJeremy L Thompson } 1089e82028bSJeremy L Thompson 1099e82028bSJeremy L Thompson static inline int CeedElemRestrictionApplyCurlOrientedNoTranspose_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp, 1109e82028bSJeremy L Thompson const CeedInt block_size, const CeedInt comp_stride, CeedInt start, 111740363ccSJeremy L Thompson CeedInt stop, CeedInt num_elem, CeedInt elem_size, CeedSize v_offset, 1129e82028bSJeremy L Thompson const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) { 1139e82028bSJeremy L Thompson // Restriction with tridiagonal transformation 1149e82028bSJeremy L Thompson CeedElemRestriction_Memcheck *impl; 1159e82028bSJeremy L Thompson 1169e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 117*33e3c889SJeremy L Thompson for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { 118*33e3c889SJeremy L Thompson CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) { 119*33e3c889SJeremy L Thompson CeedSize n = 0; 1209e82028bSJeremy L Thompson 121*33e3c889SJeremy L Thompson CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) { 122*33e3c889SJeremy L Thompson vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = 1239e82028bSJeremy L Thompson uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] * 1249e82028bSJeremy L Thompson impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size] + 1259e82028bSJeremy L Thompson uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] * 1269e82028bSJeremy L Thompson impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size]; 1279e82028bSJeremy L Thompson } 1289e82028bSJeremy L Thompson CeedPragmaSIMD for (n = 1; n < elem_size - 1; n++) { 129*33e3c889SJeremy L Thompson CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) { 130*33e3c889SJeremy L Thompson vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = 1319e82028bSJeremy L Thompson uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] * 1329e82028bSJeremy L Thompson impl->curl_orients[j + (3 * n + 0) * block_size + e * 3 * elem_size] + 1339e82028bSJeremy L Thompson uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] * 1349e82028bSJeremy L Thompson impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size] + 1359e82028bSJeremy L Thompson uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] * 1369e82028bSJeremy L Thompson impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size]; 1379e82028bSJeremy L Thompson } 1389e82028bSJeremy L Thompson } 139*33e3c889SJeremy L Thompson CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) { 140*33e3c889SJeremy L Thompson vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = 1419e82028bSJeremy L Thompson uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] * 1429e82028bSJeremy L Thompson impl->curl_orients[j + (3 * n + 0) * block_size + e * 3 * elem_size] + 1439e82028bSJeremy L Thompson uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] * 1449e82028bSJeremy L Thompson impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]; 1459e82028bSJeremy L Thompson } 1469e82028bSJeremy L Thompson } 1479e82028bSJeremy L Thompson } 1489e82028bSJeremy L Thompson return CEED_ERROR_SUCCESS; 1499e82028bSJeremy L Thompson } 1509e82028bSJeremy L Thompson 1519e82028bSJeremy L Thompson static inline int CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Memcheck_Core( 1529e82028bSJeremy L Thompson CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride, CeedInt start, CeedInt stop, 153740363ccSJeremy L Thompson CeedInt num_elem, CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) { 1549e82028bSJeremy L Thompson // Restriction with (unsigned) tridiagonal transformation 1559e82028bSJeremy L Thompson CeedElemRestriction_Memcheck *impl; 1569e82028bSJeremy L Thompson 1579e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 158*33e3c889SJeremy L Thompson for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { 159*33e3c889SJeremy L Thompson CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) { 160*33e3c889SJeremy L Thompson CeedSize n = 0; 1619e82028bSJeremy L Thompson 162*33e3c889SJeremy L Thompson CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) { 163*33e3c889SJeremy L Thompson vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = 1649e82028bSJeremy L Thompson uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] * 1659e82028bSJeremy L Thompson abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]) + 1669e82028bSJeremy L Thompson uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] * 1679e82028bSJeremy L Thompson abs(impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size]); 1689e82028bSJeremy L Thompson } 1699e82028bSJeremy L Thompson CeedPragmaSIMD for (n = 1; n < elem_size - 1; n++) { 170*33e3c889SJeremy L Thompson CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) { 171*33e3c889SJeremy L Thompson vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = 1729e82028bSJeremy L Thompson uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] * 1739e82028bSJeremy L Thompson abs(impl->curl_orients[j + (3 * n + 0) * block_size + e * 3 * elem_size]) + 1749e82028bSJeremy L Thompson uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] * 1759e82028bSJeremy L Thompson abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]) + 1769e82028bSJeremy L Thompson uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] * 1779e82028bSJeremy L Thompson abs(impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size]); 1789e82028bSJeremy L Thompson } 1799e82028bSJeremy L Thompson } 180*33e3c889SJeremy L Thompson CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) { 181*33e3c889SJeremy L Thompson vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = 1829e82028bSJeremy L Thompson uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] * 1839e82028bSJeremy L Thompson abs(impl->curl_orients[j + (3 * n + 0) * block_size + e * 3 * elem_size]) + 1849e82028bSJeremy L Thompson uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] * 1859e82028bSJeremy L Thompson abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]); 1869e82028bSJeremy L Thompson } 1879e82028bSJeremy L Thompson } 1889e82028bSJeremy L Thompson } 1899e82028bSJeremy L Thompson return CEED_ERROR_SUCCESS; 1909e82028bSJeremy L Thompson } 1919e82028bSJeremy L Thompson 1929e82028bSJeremy L Thompson static inline int CeedElemRestrictionApplyStridedTranspose_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, 1939e82028bSJeremy L Thompson CeedInt start, CeedInt stop, CeedInt num_elem, CeedInt elem_size, 194740363ccSJeremy L Thompson CeedSize v_offset, const CeedScalar *__restrict__ uu, 1959e82028bSJeremy L Thompson CeedScalar *__restrict__ vv) { 1969e82028bSJeremy L Thompson // Get strides 1979e82028bSJeremy L Thompson bool has_backend_strides; 1989e82028bSJeremy L Thompson CeedInt strides[3] = {0}; 1999e82028bSJeremy L Thompson 2009e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides)); 2019e82028bSJeremy L Thompson if (has_backend_strides) CeedCallBackend(CeedElemRestrictionGetBackendStrides_Memcheck(rstr, strides)); 20256c48462SJeremy L Thompson else CeedCallBackend(CeedElemRestrictionGetStrides(rstr, strides)); 2039e82028bSJeremy L Thompson 2049e82028bSJeremy L Thompson // Apply restriction 205*33e3c889SJeremy L Thompson for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { 206*33e3c889SJeremy L Thompson CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) { 207*33e3c889SJeremy L Thompson CeedPragmaSIMD for (CeedSize n = 0; n < elem_size; n++) { 208*33e3c889SJeremy L Thompson CeedPragmaSIMD for (CeedSize j = 0; j < CeedIntMin(block_size, num_elem - e); j++) { 209*33e3c889SJeremy L Thompson vv[n * strides[0] + k * strides[1] + (e + j) * strides[2]] += 210*33e3c889SJeremy L Thompson uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset]; 2119e82028bSJeremy L Thompson } 2129e82028bSJeremy L Thompson } 2139e82028bSJeremy L Thompson } 2149e82028bSJeremy L Thompson } 2159e82028bSJeremy L Thompson return CEED_ERROR_SUCCESS; 2169e82028bSJeremy L Thompson } 2179e82028bSJeremy L Thompson 2189e82028bSJeremy L Thompson static inline int CeedElemRestrictionApplyOffsetTranspose_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, 2199e82028bSJeremy L Thompson const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem, 220740363ccSJeremy L Thompson CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu, 2219e82028bSJeremy L Thompson CeedScalar *__restrict__ vv) { 2229e82028bSJeremy L Thompson // Default restriction with offsets 2239e82028bSJeremy L Thompson CeedElemRestriction_Memcheck *impl; 2249e82028bSJeremy L Thompson 2259e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 226*33e3c889SJeremy L Thompson for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { 227*33e3c889SJeremy L Thompson for (CeedSize k = 0; k < num_comp; k++) { 228*33e3c889SJeremy L Thompson for (CeedSize i = 0; i < elem_size * block_size; i += block_size) { 2299e82028bSJeremy L Thompson // Iteration bound set to discard padding elements 230*33e3c889SJeremy L Thompson for (CeedSize j = i; j < i + CeedIntMin(block_size, num_elem - e); j++) { 2319e82028bSJeremy L Thompson CeedScalar vv_loc; 2329e82028bSJeremy L Thompson 233*33e3c889SJeremy L Thompson vv_loc = uu[elem_size * (k * block_size + e * num_comp) + j - v_offset]; 2349e82028bSJeremy L Thompson CeedPragmaAtomic vv[impl->offsets[j + e * elem_size] + k * comp_stride] += vv_loc; 2359e82028bSJeremy L Thompson } 2369e82028bSJeremy L Thompson } 2379e82028bSJeremy L Thompson } 2389e82028bSJeremy L Thompson } 2399e82028bSJeremy L Thompson return CEED_ERROR_SUCCESS; 2409e82028bSJeremy L Thompson } 2419e82028bSJeremy L Thompson 2429e82028bSJeremy L Thompson static inline int CeedElemRestrictionApplyOrientedTranspose_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, 2439e82028bSJeremy L Thompson const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem, 244740363ccSJeremy L Thompson CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu, 2459e82028bSJeremy L Thompson CeedScalar *__restrict__ vv) { 2469e82028bSJeremy L Thompson // Restriction with orientations 2479e82028bSJeremy L Thompson CeedElemRestriction_Memcheck *impl; 2489e82028bSJeremy L Thompson 2499e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 250*33e3c889SJeremy L Thompson for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { 251*33e3c889SJeremy L Thompson for (CeedSize k = 0; k < num_comp; k++) { 252*33e3c889SJeremy L Thompson for (CeedSize i = 0; i < elem_size * block_size; i += block_size) { 2539e82028bSJeremy L Thompson // Iteration bound set to discard padding elements 254*33e3c889SJeremy L Thompson for (CeedSize j = i; j < i + CeedIntMin(block_size, num_elem - e); j++) { 2559e82028bSJeremy L Thompson CeedScalar vv_loc; 2569e82028bSJeremy L Thompson 257*33e3c889SJeremy L Thompson vv_loc = uu[elem_size * (k * block_size + e * num_comp) + j - v_offset] * (impl->orients[j + e * elem_size] ? -1.0 : 1.0); 2589e82028bSJeremy L Thompson CeedPragmaAtomic vv[impl->offsets[j + e * elem_size] + k * comp_stride] += vv_loc; 2599e82028bSJeremy L Thompson } 2609e82028bSJeremy L Thompson } 2619e82028bSJeremy L Thompson } 2629e82028bSJeremy L Thompson } 2639e82028bSJeremy L Thompson return CEED_ERROR_SUCCESS; 2649e82028bSJeremy L Thompson } 2659e82028bSJeremy L Thompson 2669e82028bSJeremy L Thompson static inline int CeedElemRestrictionApplyCurlOrientedTranspose_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp, 2679e82028bSJeremy L Thompson const CeedInt block_size, const CeedInt comp_stride, CeedInt start, 268740363ccSJeremy L Thompson CeedInt stop, CeedInt num_elem, CeedInt elem_size, CeedSize v_offset, 2699e82028bSJeremy L Thompson const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) { 2709e82028bSJeremy L Thompson // Restriction with tridiagonal transformation 2719e82028bSJeremy L Thompson CeedElemRestriction_Memcheck *impl; 2729e82028bSJeremy L Thompson CeedScalar vv_loc[block_size]; 2739e82028bSJeremy L Thompson 2749e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 275*33e3c889SJeremy L Thompson for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { 276*33e3c889SJeremy L Thompson for (CeedSize k = 0; k < num_comp; k++) { 2779e82028bSJeremy L Thompson // Iteration bound set to discard padding elements 278*33e3c889SJeremy L Thompson const CeedSize block_end = CeedIntMin(block_size, num_elem - e); 279*33e3c889SJeremy L Thompson CeedSize n = 0; 2809e82028bSJeremy L Thompson 281*33e3c889SJeremy L Thompson CeedPragmaSIMD for (CeedSize j = 0; j < block_end; j++) { 282*33e3c889SJeremy L Thompson vv_loc[j] = uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] * 2839e82028bSJeremy L Thompson impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size] + 284*33e3c889SJeremy L Thompson uu[e * elem_size * num_comp + (k * elem_size + n + 1) * block_size + j - v_offset] * 2859e82028bSJeremy L Thompson impl->curl_orients[j + (3 * n + 3) * block_size + e * 3 * elem_size]; 2869e82028bSJeremy L Thompson } 287*33e3c889SJeremy L Thompson for (CeedSize j = 0; j < block_end; j++) { 2889e82028bSJeremy L Thompson CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; 2899e82028bSJeremy L Thompson } 2909e82028bSJeremy L Thompson for (n = 1; n < elem_size - 1; n++) { 291*33e3c889SJeremy L Thompson CeedPragmaSIMD for (CeedSize j = 0; j < block_end; j++) { 292*33e3c889SJeremy L Thompson vv_loc[j] = uu[e * elem_size * num_comp + (k * elem_size + n - 1) * block_size + j - v_offset] * 2939e82028bSJeremy L Thompson impl->curl_orients[j + (3 * n - 1) * block_size + e * 3 * elem_size] + 294*33e3c889SJeremy L Thompson uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] * 2959e82028bSJeremy L Thompson impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size] + 296*33e3c889SJeremy L Thompson uu[e * elem_size * num_comp + (k * elem_size + n + 1) * block_size + j - v_offset] * 2979e82028bSJeremy L Thompson impl->curl_orients[j + (3 * n + 3) * block_size + e * 3 * elem_size]; 2989e82028bSJeremy L Thompson } 299*33e3c889SJeremy L Thompson for (CeedSize j = 0; j < block_end; j++) { 3009e82028bSJeremy L Thompson CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; 3019e82028bSJeremy L Thompson } 3029e82028bSJeremy L Thompson } 303*33e3c889SJeremy L Thompson CeedPragmaSIMD for (CeedSize j = 0; j < block_end; j++) { 304*33e3c889SJeremy L Thompson vv_loc[j] = uu[e * elem_size * num_comp + (k * elem_size + n - 1) * block_size + j - v_offset] * 3059e82028bSJeremy L Thompson impl->curl_orients[j + (3 * n - 1) * block_size + e * 3 * elem_size] + 306*33e3c889SJeremy L Thompson uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] * 3079e82028bSJeremy L Thompson impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]; 3089e82028bSJeremy L Thompson } 309*33e3c889SJeremy L Thompson for (CeedSize j = 0; j < block_end; j++) { 3109e82028bSJeremy L Thompson CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; 3119e82028bSJeremy L Thompson } 3129e82028bSJeremy L Thompson } 3139e82028bSJeremy L Thompson } 3149e82028bSJeremy L Thompson return CEED_ERROR_SUCCESS; 3159e82028bSJeremy L Thompson } 3169e82028bSJeremy L Thompson 3179e82028bSJeremy L Thompson static inline int CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Memcheck_Core( 3189e82028bSJeremy L Thompson CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride, CeedInt start, CeedInt stop, 319740363ccSJeremy L Thompson CeedInt num_elem, CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) { 3209e82028bSJeremy L Thompson // Restriction with (unsigned) tridiagonal transformation 3219e82028bSJeremy L Thompson CeedElemRestriction_Memcheck *impl; 3229e82028bSJeremy L Thompson CeedScalar vv_loc[block_size]; 3239e82028bSJeremy L Thompson 3249e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 325*33e3c889SJeremy L Thompson for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { 326*33e3c889SJeremy L Thompson for (CeedSize k = 0; k < num_comp; k++) { 3279e82028bSJeremy L Thompson // Iteration bound set to discard padding elements 328*33e3c889SJeremy L Thompson const CeedSize block_end = CeedIntMin(block_size, num_elem - e); 329*33e3c889SJeremy L Thompson CeedSize n = 0; 3309e82028bSJeremy L Thompson 331*33e3c889SJeremy L Thompson CeedPragmaSIMD for (CeedSize j = 0; j < block_end; j++) { 332*33e3c889SJeremy L Thompson vv_loc[j] = uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] * 3339e82028bSJeremy L Thompson abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]) + 334*33e3c889SJeremy L Thompson uu[e * elem_size * num_comp + (k * elem_size + n + 1) * block_size + j - v_offset] * 3359e82028bSJeremy L Thompson abs(impl->curl_orients[j + (3 * n + 3) * block_size + e * 3 * elem_size]); 3369e82028bSJeremy L Thompson } 337*33e3c889SJeremy L Thompson for (CeedSize j = 0; j < block_end; j++) { 3389e82028bSJeremy L Thompson CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; 3399e82028bSJeremy L Thompson } 3409e82028bSJeremy L Thompson for (n = 1; n < elem_size - 1; n++) { 341*33e3c889SJeremy L Thompson CeedPragmaSIMD for (CeedSize j = 0; j < block_end; j++) { 342*33e3c889SJeremy L Thompson vv_loc[j] = uu[e * elem_size * num_comp + (k * elem_size + n - 1) * block_size + j - v_offset] * 3439e82028bSJeremy L Thompson abs(impl->curl_orients[j + (3 * n - 1) * block_size + e * 3 * elem_size]) + 344*33e3c889SJeremy L Thompson uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] * 3459e82028bSJeremy L Thompson abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]) + 346*33e3c889SJeremy L Thompson uu[e * elem_size * num_comp + (k * elem_size + n + 1) * block_size + j - v_offset] * 3479e82028bSJeremy L Thompson abs(impl->curl_orients[j + (3 * n + 3) * block_size + e * 3 * elem_size]); 3489e82028bSJeremy L Thompson } 349*33e3c889SJeremy L Thompson for (CeedSize j = 0; j < block_end; j++) { 3509e82028bSJeremy L Thompson CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; 3519e82028bSJeremy L Thompson } 3529e82028bSJeremy L Thompson } 353*33e3c889SJeremy L Thompson CeedPragmaSIMD for (CeedSize j = 0; j < block_end; j++) { 354*33e3c889SJeremy L Thompson vv_loc[j] = uu[e * elem_size * num_comp + (k * elem_size + n - 1) * block_size + j - v_offset] * 3559e82028bSJeremy L Thompson abs(impl->curl_orients[j + (3 * n - 1) * block_size + e * 3 * elem_size]) + 356*33e3c889SJeremy L Thompson uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] * 3579e82028bSJeremy L Thompson abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]); 3589e82028bSJeremy L Thompson } 359*33e3c889SJeremy L Thompson for (CeedSize j = 0; j < block_end; j++) { 3609e82028bSJeremy L Thompson CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; 3619e82028bSJeremy L Thompson } 3629e82028bSJeremy L Thompson } 3639e82028bSJeremy L Thompson } 3649e82028bSJeremy L Thompson return CEED_ERROR_SUCCESS; 3659e82028bSJeremy L Thompson } 3669e82028bSJeremy L Thompson 3679e82028bSJeremy L Thompson static inline int CeedElemRestrictionApplyAtPointsInElement_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp, CeedInt start, 3689e82028bSJeremy L Thompson CeedInt stop, CeedTransposeMode t_mode, const CeedScalar *__restrict__ uu, 3699e82028bSJeremy L Thompson CeedScalar *__restrict__ vv) { 370740363ccSJeremy L Thompson CeedInt num_points, l_vec_offset; 371740363ccSJeremy L Thompson CeedSize e_vec_offset = 0; 3729e82028bSJeremy L Thompson CeedElemRestriction_Memcheck *impl; 3739e82028bSJeremy L Thompson 3749e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 375*33e3c889SJeremy L Thompson for (CeedSize e = start; e < stop; e++) { 3769e82028bSJeremy L Thompson l_vec_offset = impl->offsets[e]; 3779e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr, e, &num_points)); 3789e82028bSJeremy L Thompson if (t_mode == CEED_NOTRANSPOSE) { 379*33e3c889SJeremy L Thompson for (CeedSize i = 0; i < num_points; i++) { 380*33e3c889SJeremy L Thompson for (CeedSize j = 0; j < num_comp; j++) vv[j * num_points + i + e_vec_offset] = uu[impl->offsets[i + l_vec_offset] * num_comp + j]; 3819e82028bSJeremy L Thompson } 3829e82028bSJeremy L Thompson } else { 383*33e3c889SJeremy L Thompson for (CeedSize i = 0; i < num_points; i++) { 384*33e3c889SJeremy L Thompson for (CeedSize j = 0; j < num_comp; j++) vv[impl->offsets[i + l_vec_offset] * num_comp + j] = uu[j * num_points + i + e_vec_offset]; 3859e82028bSJeremy L Thompson } 3869e82028bSJeremy L Thompson } 387740363ccSJeremy L Thompson e_vec_offset += num_points * (CeedSize)num_comp; 3889e82028bSJeremy L Thompson } 3899e82028bSJeremy L Thompson return CEED_ERROR_SUCCESS; 3909e82028bSJeremy L Thompson } 3919e82028bSJeremy L Thompson 3929e82028bSJeremy L Thompson static inline int CeedElemRestrictionApply_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, 3939e82028bSJeremy L Thompson const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedTransposeMode t_mode, 3949e82028bSJeremy L Thompson bool use_signs, bool use_orients, CeedVector u, CeedVector v, CeedRequest *request) { 395740363ccSJeremy L Thompson CeedInt num_elem, elem_size; 396740363ccSJeremy L Thompson CeedSize v_offset; 3979e82028bSJeremy L Thompson CeedRestrictionType rstr_type; 3989e82028bSJeremy L Thompson const CeedScalar *uu; 3999e82028bSJeremy L Thompson CeedScalar *vv; 4009e82028bSJeremy L Thompson 4019e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); 4029e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); 403740363ccSJeremy L Thompson v_offset = start * block_size * elem_size * (CeedSize)num_comp; 4049e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); 4059e82028bSJeremy L Thompson CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_HOST, &uu)); 4069e82028bSJeremy L Thompson 4079e82028bSJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 4089e82028bSJeremy L Thompson // Sum into for transpose mode, E-vector to L-vector 4099e82028bSJeremy L Thompson CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_HOST, &vv)); 4109e82028bSJeremy L Thompson } else { 4119e82028bSJeremy L Thompson // Overwrite for notranspose mode, L-vector to E-vector 4129e82028bSJeremy L Thompson CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_HOST, &vv)); 4139e82028bSJeremy L Thompson } 4149e82028bSJeremy L Thompson 4159e82028bSJeremy L Thompson if (t_mode == CEED_TRANSPOSE) { 4169e82028bSJeremy L Thompson // Restriction from E-vector to L-vector 4179e82028bSJeremy L Thompson // Performing v += r^T * u 4189e82028bSJeremy L Thompson // uu has shape [elem_size, num_comp, num_elem], row-major 4199e82028bSJeremy L Thompson // vv has shape [nnodes, num_comp] 4209e82028bSJeremy L Thompson // Sum into for transpose mode 4219e82028bSJeremy L Thompson switch (rstr_type) { 4229e82028bSJeremy L Thompson case CEED_RESTRICTION_STRIDED: 4239e82028bSJeremy L Thompson CeedCallBackend( 4249e82028bSJeremy L Thompson CeedElemRestrictionApplyStridedTranspose_Memcheck_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, v_offset, uu, vv)); 4259e82028bSJeremy L Thompson break; 4269e82028bSJeremy L Thompson case CEED_RESTRICTION_STANDARD: 4279e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionApplyOffsetTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, 4289e82028bSJeremy L Thompson elem_size, v_offset, uu, vv)); 4299e82028bSJeremy L Thompson break; 4309e82028bSJeremy L Thompson case CEED_RESTRICTION_ORIENTED: 4319e82028bSJeremy L Thompson if (use_signs) { 4329e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionApplyOrientedTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, 4339e82028bSJeremy L Thompson elem_size, v_offset, uu, vv)); 4349e82028bSJeremy L Thompson } else { 4359e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionApplyOffsetTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, 4369e82028bSJeremy L Thompson elem_size, v_offset, uu, vv)); 4379e82028bSJeremy L Thompson } 4389e82028bSJeremy L Thompson break; 4399e82028bSJeremy L Thompson case CEED_RESTRICTION_CURL_ORIENTED: 4409e82028bSJeremy L Thompson if (use_signs && use_orients) { 4419e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionApplyCurlOrientedTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, 4429e82028bSJeremy L Thompson elem_size, v_offset, uu, vv)); 4439e82028bSJeremy L Thompson } else if (use_orients) { 4449e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, 4459e82028bSJeremy L Thompson num_elem, elem_size, v_offset, uu, vv)); 4469e82028bSJeremy L Thompson } else { 4479e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionApplyOffsetTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, 4489e82028bSJeremy L Thompson elem_size, v_offset, uu, vv)); 4499e82028bSJeremy L Thompson } 4509e82028bSJeremy L Thompson break; 4519e82028bSJeremy L Thompson case CEED_RESTRICTION_POINTS: 4529e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement_Memcheck_Core(rstr, num_comp, start, stop, t_mode, uu, vv)); 4539e82028bSJeremy L Thompson break; 4549e82028bSJeremy L Thompson } 4559e82028bSJeremy L Thompson } else { 4569e82028bSJeremy L Thompson // Restriction from L-vector to E-vector 4579e82028bSJeremy L Thompson // Perform: v = r * u 4589e82028bSJeremy L Thompson // vv has shape [elem_size, num_comp, num_elem], row-major 4599e82028bSJeremy L Thompson // uu has shape [nnodes, num_comp] 4609e82028bSJeremy L Thompson // Overwrite for notranspose mode 4619e82028bSJeremy L Thompson switch (rstr_type) { 4629e82028bSJeremy L Thompson case CEED_RESTRICTION_STRIDED: 4639e82028bSJeremy L Thompson CeedCallBackend( 4649e82028bSJeremy L Thompson CeedElemRestrictionApplyStridedNoTranspose_Memcheck_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, v_offset, uu, vv)); 4659e82028bSJeremy L Thompson break; 4669e82028bSJeremy L Thompson case CEED_RESTRICTION_STANDARD: 4679e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionApplyOffsetNoTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, 4689e82028bSJeremy L Thompson elem_size, v_offset, uu, vv)); 4699e82028bSJeremy L Thompson break; 4709e82028bSJeremy L Thompson case CEED_RESTRICTION_ORIENTED: 4719e82028bSJeremy L Thompson if (use_signs) { 4729e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionApplyOrientedNoTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, 4739e82028bSJeremy L Thompson elem_size, v_offset, uu, vv)); 4749e82028bSJeremy L Thompson } else { 4759e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionApplyOffsetNoTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, 4769e82028bSJeremy L Thompson elem_size, v_offset, uu, vv)); 4779e82028bSJeremy L Thompson } 4789e82028bSJeremy L Thompson break; 4799e82028bSJeremy L Thompson case CEED_RESTRICTION_CURL_ORIENTED: 4809e82028bSJeremy L Thompson if (use_signs && use_orients) { 4819e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionApplyCurlOrientedNoTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, 4829e82028bSJeremy L Thompson num_elem, elem_size, v_offset, uu, vv)); 4839e82028bSJeremy L Thompson } else if (use_orients) { 4849e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, 4859e82028bSJeremy L Thompson num_elem, elem_size, v_offset, uu, vv)); 4869e82028bSJeremy L Thompson } else { 4879e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionApplyOffsetNoTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem, 4889e82028bSJeremy L Thompson elem_size, v_offset, uu, vv)); 4899e82028bSJeremy L Thompson } 4909e82028bSJeremy L Thompson break; 4919e82028bSJeremy L Thompson case CEED_RESTRICTION_POINTS: 4929e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement_Memcheck_Core(rstr, num_comp, start, stop, t_mode, uu, vv)); 4939e82028bSJeremy L Thompson break; 4949e82028bSJeremy L Thompson } 4959e82028bSJeremy L Thompson } 4969e82028bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArrayRead(u, &uu)); 4979e82028bSJeremy L Thompson CeedCallBackend(CeedVectorRestoreArray(v, &vv)); 4989e82028bSJeremy L Thompson if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED) *request = NULL; 4999e82028bSJeremy L Thompson return CEED_ERROR_SUCCESS; 5009e82028bSJeremy L Thompson } 5019e82028bSJeremy L Thompson 5029e82028bSJeremy L Thompson //------------------------------------------------------------------------------ 5039e82028bSJeremy L Thompson // ElemRestriction Apply 5049e82028bSJeremy L Thompson //------------------------------------------------------------------------------ 5059e82028bSJeremy L Thompson static int CeedElemRestrictionApply_Memcheck(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) { 5069e82028bSJeremy L Thompson CeedInt num_block, block_size, num_comp, comp_stride; 5079e82028bSJeremy L Thompson CeedElemRestriction_Memcheck *impl; 5089e82028bSJeremy L Thompson 5099e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumBlocks(rstr, &num_block)); 5109e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetBlockSize(rstr, &block_size)); 5119e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); 5129e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride)); 5139e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 5149e82028bSJeremy L Thompson CeedCallBackend(impl->Apply(rstr, num_comp, block_size, comp_stride, 0, num_block, t_mode, true, true, u, v, request)); 5159e82028bSJeremy L Thompson return CEED_ERROR_SUCCESS; 5169e82028bSJeremy L Thompson } 5179e82028bSJeremy L Thompson 5189e82028bSJeremy L Thompson //------------------------------------------------------------------------------ 5199e82028bSJeremy L Thompson // ElemRestriction Apply Unsigned 5209e82028bSJeremy L Thompson //------------------------------------------------------------------------------ 5219e82028bSJeremy L Thompson static int CeedElemRestrictionApplyUnsigned_Memcheck(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, 5229e82028bSJeremy L Thompson CeedRequest *request) { 5239e82028bSJeremy L Thompson CeedInt num_block, block_size, num_comp, comp_stride; 5249e82028bSJeremy L Thompson CeedElemRestriction_Memcheck *impl; 5259e82028bSJeremy L Thompson 5269e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumBlocks(rstr, &num_block)); 5279e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetBlockSize(rstr, &block_size)); 5289e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); 5299e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride)); 5309e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 5319e82028bSJeremy L Thompson CeedCallBackend(impl->Apply(rstr, num_comp, block_size, comp_stride, 0, num_block, t_mode, false, true, u, v, request)); 5329e82028bSJeremy L Thompson return CEED_ERROR_SUCCESS; 5339e82028bSJeremy L Thompson } 5349e82028bSJeremy L Thompson 5359e82028bSJeremy L Thompson //------------------------------------------------------------------------------ 5369e82028bSJeremy L Thompson // ElemRestriction Apply Unoriented 5379e82028bSJeremy L Thompson //------------------------------------------------------------------------------ 5389e82028bSJeremy L Thompson static int CeedElemRestrictionApplyUnoriented_Memcheck(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, 5399e82028bSJeremy L Thompson CeedRequest *request) { 5409e82028bSJeremy L Thompson CeedInt num_block, block_size, num_comp, comp_stride; 5419e82028bSJeremy L Thompson CeedElemRestriction_Memcheck *impl; 5429e82028bSJeremy L Thompson 5439e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumBlocks(rstr, &num_block)); 5449e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetBlockSize(rstr, &block_size)); 5459e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); 5469e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride)); 5479e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 5489e82028bSJeremy L Thompson CeedCallBackend(impl->Apply(rstr, num_comp, block_size, comp_stride, 0, num_block, t_mode, false, false, u, v, request)); 5499e82028bSJeremy L Thompson return CEED_ERROR_SUCCESS; 5509e82028bSJeremy L Thompson } 5519e82028bSJeremy L Thompson 5529e82028bSJeremy L Thompson //------------------------------------------------------------------------------ 5539e82028bSJeremy L Thompson // ElemRestriction Apply Points 5549e82028bSJeremy L Thompson //------------------------------------------------------------------------------ 5559e82028bSJeremy L Thompson static int CeedElemRestrictionApplyAtPointsInElement_Memcheck(CeedElemRestriction rstr, CeedInt elem, CeedTransposeMode t_mode, CeedVector u, 5569e82028bSJeremy L Thompson CeedVector v, CeedRequest *request) { 5579e82028bSJeremy L Thompson CeedInt num_comp; 5589e82028bSJeremy L Thompson CeedElemRestriction_Memcheck *impl; 5599e82028bSJeremy L Thompson 5609e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); 5619e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 5629e82028bSJeremy L Thompson return impl->Apply(rstr, num_comp, 0, 1, elem, elem + 1, t_mode, false, false, u, v, request); 5639e82028bSJeremy L Thompson } 5649e82028bSJeremy L Thompson 5659e82028bSJeremy L Thompson //------------------------------------------------------------------------------ 5669e82028bSJeremy L Thompson // ElemRestriction Apply Block 5679e82028bSJeremy L Thompson //------------------------------------------------------------------------------ 5689e82028bSJeremy L Thompson static int CeedElemRestrictionApplyBlock_Memcheck(CeedElemRestriction rstr, CeedInt block, CeedTransposeMode t_mode, CeedVector u, CeedVector v, 5699e82028bSJeremy L Thompson CeedRequest *request) { 5709e82028bSJeremy L Thompson CeedInt block_size, num_comp, comp_stride; 5719e82028bSJeremy L Thompson CeedElemRestriction_Memcheck *impl; 5729e82028bSJeremy L Thompson 5739e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetBlockSize(rstr, &block_size)); 5749e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); 5759e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride)); 5769e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 5779e82028bSJeremy L Thompson CeedCallBackend(impl->Apply(rstr, num_comp, block_size, comp_stride, block, block + 1, t_mode, true, true, u, v, request)); 5789e82028bSJeremy L Thompson return CEED_ERROR_SUCCESS; 5799e82028bSJeremy L Thompson } 5809e82028bSJeremy L Thompson 5819e82028bSJeremy L Thompson //------------------------------------------------------------------------------ 5829e82028bSJeremy L Thompson // ElemRestriction Get Offsets 5839e82028bSJeremy L Thompson //------------------------------------------------------------------------------ 5849e82028bSJeremy L Thompson static int CeedElemRestrictionGetOffsets_Memcheck(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt **offsets) { 5859e82028bSJeremy L Thompson CeedElemRestriction_Memcheck *impl; 5869e82028bSJeremy L Thompson 5879e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 5889e82028bSJeremy L Thompson 5896e536b99SJeremy L Thompson CeedCheck(mem_type == CEED_MEM_HOST, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_BACKEND, "Can only provide to HOST memory"); 5909e82028bSJeremy L Thompson 5919e82028bSJeremy L Thompson *offsets = impl->offsets; 5929e82028bSJeremy L Thompson return CEED_ERROR_SUCCESS; 5939e82028bSJeremy L Thompson } 5949e82028bSJeremy L Thompson 5959e82028bSJeremy L Thompson //------------------------------------------------------------------------------ 5969e82028bSJeremy L Thompson // ElemRestriction Get Orientations 5979e82028bSJeremy L Thompson //------------------------------------------------------------------------------ 5989e82028bSJeremy L Thompson static int CeedElemRestrictionGetOrientations_Memcheck(CeedElemRestriction rstr, CeedMemType mem_type, const bool **orients) { 5999e82028bSJeremy L Thompson CeedElemRestriction_Memcheck *impl; 6009e82028bSJeremy L Thompson 6019e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 6029e82028bSJeremy L Thompson 6036e536b99SJeremy L Thompson CeedCheck(mem_type == CEED_MEM_HOST, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_BACKEND, "Can only provide to HOST memory"); 6049e82028bSJeremy L Thompson 6059e82028bSJeremy L Thompson *orients = impl->orients; 6069e82028bSJeremy L Thompson return CEED_ERROR_SUCCESS; 6079e82028bSJeremy L Thompson } 6089e82028bSJeremy L Thompson 6099e82028bSJeremy L Thompson //------------------------------------------------------------------------------ 6109e82028bSJeremy L Thompson // ElemRestriction Get Curl-Conforming Orientations 6119e82028bSJeremy L Thompson //------------------------------------------------------------------------------ 6129e82028bSJeremy L Thompson static int CeedElemRestrictionGetCurlOrientations_Memcheck(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt8 **curl_orients) { 6139e82028bSJeremy L Thompson CeedElemRestriction_Memcheck *impl; 6149e82028bSJeremy L Thompson 6159e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 6169e82028bSJeremy L Thompson 6176e536b99SJeremy L Thompson CeedCheck(mem_type == CEED_MEM_HOST, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_BACKEND, "Can only provide to HOST memory"); 6189e82028bSJeremy L Thompson 6199e82028bSJeremy L Thompson *curl_orients = impl->curl_orients; 6209e82028bSJeremy L Thompson return CEED_ERROR_SUCCESS; 6219e82028bSJeremy L Thompson } 6229e82028bSJeremy L Thompson 6239e82028bSJeremy L Thompson //------------------------------------------------------------------------------ 6249e82028bSJeremy L Thompson // ElemRestriction Destroy 6259e82028bSJeremy L Thompson //------------------------------------------------------------------------------ 6269e82028bSJeremy L Thompson static int CeedElemRestrictionDestroy_Memcheck(CeedElemRestriction rstr) { 6279e82028bSJeremy L Thompson CeedElemRestriction_Memcheck *impl; 6289e82028bSJeremy L Thompson 6299e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 6309e82028bSJeremy L Thompson CeedCallBackend(CeedFree(&impl->offsets_allocated)); 6319e82028bSJeremy L Thompson CeedCallBackend(CeedFree(&impl->orients_allocated)); 6329e82028bSJeremy L Thompson CeedCallBackend(CeedFree(&impl->curl_orients_allocated)); 6339e82028bSJeremy L Thompson CeedCallBackend(CeedFree(&impl)); 6349e82028bSJeremy L Thompson return CEED_ERROR_SUCCESS; 6359e82028bSJeremy L Thompson } 6369e82028bSJeremy L Thompson 6379e82028bSJeremy L Thompson //------------------------------------------------------------------------------ 6389e82028bSJeremy L Thompson // ElemRestriction Create 6399e82028bSJeremy L Thompson //------------------------------------------------------------------------------ 6409e82028bSJeremy L Thompson int CeedElemRestrictionCreate_Memcheck(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *offsets, const bool *orients, 6419e82028bSJeremy L Thompson const CeedInt8 *curl_orients, CeedElemRestriction rstr) { 6429e82028bSJeremy L Thompson Ceed ceed; 6439e82028bSJeremy L Thompson CeedInt num_elem, elem_size, num_block, block_size, num_comp, comp_stride, num_points = 0, num_offsets; 6449e82028bSJeremy L Thompson CeedRestrictionType rstr_type; 6459e82028bSJeremy L Thompson CeedElemRestriction_Memcheck *impl; 6469e82028bSJeremy L Thompson 6479e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed)); 6489e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); 6499e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); 6509e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumBlocks(rstr, &num_block)); 6519e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetBlockSize(rstr, &block_size)); 6529e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp)); 6539e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride)); 65422eb1385SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type)); 6559e82028bSJeremy L Thompson 6569e82028bSJeremy L Thompson CeedCheck(mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "Only MemType = HOST supported"); 6579e82028bSJeremy L Thompson 6589e82028bSJeremy L Thompson CeedCallBackend(CeedCalloc(1, &impl)); 6599e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionSetData(rstr, impl)); 66022eb1385SJeremy L Thompson 66122eb1385SJeremy L Thompson // Set layouts 66222eb1385SJeremy L Thompson { 66322eb1385SJeremy L Thompson bool has_backend_strides; 66422eb1385SJeremy L Thompson CeedInt e_layout[3] = {1, elem_size, elem_size * num_comp}, l_layout[3] = {0}; 66522eb1385SJeremy L Thompson 66622eb1385SJeremy L Thompson CeedCallBackend(CeedElemRestrictionSetELayout(rstr, e_layout)); 66722eb1385SJeremy L Thompson if (rstr_type == CEED_RESTRICTION_STRIDED) { 66822eb1385SJeremy L Thompson CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides)); 66922eb1385SJeremy L Thompson if (has_backend_strides) { 67022eb1385SJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetBackendStrides_Memcheck(rstr, l_layout)); 67122eb1385SJeremy L Thompson CeedCallBackend(CeedElemRestrictionSetLLayout(rstr, l_layout)); 67222eb1385SJeremy L Thompson } 67322eb1385SJeremy L Thompson } 67422eb1385SJeremy L Thompson } 6759e82028bSJeremy L Thompson 6769e82028bSJeremy L Thompson // Offsets data 6779e82028bSJeremy L Thompson if (rstr_type != CEED_RESTRICTION_STRIDED) { 6789e82028bSJeremy L Thompson const char *resource; 6799e82028bSJeremy L Thompson 6809e82028bSJeremy L Thompson // Check indices for ref or memcheck backends 6819e82028bSJeremy L Thompson { 6829e82028bSJeremy L Thompson Ceed current = ceed, parent = NULL; 6839e82028bSJeremy L Thompson 6849e82028bSJeremy L Thompson CeedCallBackend(CeedGetParent(current, &parent)); 6859e82028bSJeremy L Thompson while (current != parent) { 6869e82028bSJeremy L Thompson current = parent; 6879e82028bSJeremy L Thompson CeedCallBackend(CeedGetParent(current, &parent)); 6889e82028bSJeremy L Thompson } 6899e82028bSJeremy L Thompson CeedCallBackend(CeedGetResource(parent, &resource)); 6909e82028bSJeremy L Thompson } 6919e82028bSJeremy L Thompson if (!strcmp(resource, "/cpu/self/ref/serial") || !strcmp(resource, "/cpu/self/ref/blocked") || !strcmp(resource, "/cpu/self/memcheck/serial") || 6929e82028bSJeremy L Thompson !strcmp(resource, "/cpu/self/memcheck/blocked")) { 6939e82028bSJeremy L Thompson CeedSize l_size; 6949e82028bSJeremy L Thompson 6959e82028bSJeremy L Thompson CeedCallBackend(CeedElemRestrictionGetLVectorSize(rstr, &l_size)); 6969e82028bSJeremy L Thompson for (CeedInt i = 0; i < num_elem * elem_size; i++) { 6979e82028bSJeremy L Thompson CeedCheck(offsets[i] >= 0 && offsets[i] + (num_comp - 1) * comp_stride < l_size, ceed, CEED_ERROR_BACKEND, 6989e82028bSJeremy L Thompson "Restriction offset %" CeedInt_FMT " (%" CeedInt_FMT ") out of range [0, %" CeedInt_FMT "]", i, offsets[i], l_size); 6999e82028bSJeremy L Thompson } 7009e82028bSJeremy L Thompson } 7019e82028bSJeremy L Thompson 7029e82028bSJeremy L Thompson // Copy data 7039e82028bSJeremy L Thompson if (rstr_type == CEED_RESTRICTION_POINTS) CeedCallBackend(CeedElemRestrictionGetNumPoints(rstr, &num_points)); 7049e82028bSJeremy L Thompson num_offsets = rstr_type == CEED_RESTRICTION_POINTS ? (num_elem + 1 + num_points) : (num_elem * elem_size); 7059e82028bSJeremy L Thompson switch (copy_mode) { 7069e82028bSJeremy L Thompson case CEED_COPY_VALUES: 7079e82028bSJeremy L Thompson CeedCallBackend(CeedMalloc(num_offsets, &impl->offsets_allocated)); 7089e82028bSJeremy L Thompson memcpy(impl->offsets_allocated, offsets, num_offsets * sizeof(offsets[0])); 7099e82028bSJeremy L Thompson impl->offsets = impl->offsets_allocated; 7109e82028bSJeremy L Thompson break; 7119e82028bSJeremy L Thompson case CEED_OWN_POINTER: 7129e82028bSJeremy L Thompson impl->offsets_allocated = (CeedInt *)offsets; 7139e82028bSJeremy L Thompson impl->offsets = impl->offsets_allocated; 7149e82028bSJeremy L Thompson break; 7159e82028bSJeremy L Thompson case CEED_USE_POINTER: 7169e82028bSJeremy L Thompson impl->offsets = offsets; 7179e82028bSJeremy L Thompson } 7189e82028bSJeremy L Thompson 7199e82028bSJeremy L Thompson // Orientation data 7209e82028bSJeremy L Thompson if (rstr_type == CEED_RESTRICTION_ORIENTED) { 7219e82028bSJeremy L Thompson CeedCheck(orients != NULL, ceed, CEED_ERROR_BACKEND, "No orients array provided for oriented restriction"); 7229e82028bSJeremy L Thompson switch (copy_mode) { 7239e82028bSJeremy L Thompson case CEED_COPY_VALUES: 7249e82028bSJeremy L Thompson CeedCallBackend(CeedMalloc(num_offsets, &impl->orients_allocated)); 7259e82028bSJeremy L Thompson memcpy(impl->orients_allocated, orients, num_offsets * sizeof(orients[0])); 7269e82028bSJeremy L Thompson impl->orients = impl->orients_allocated; 7279e82028bSJeremy L Thompson break; 7289e82028bSJeremy L Thompson case CEED_OWN_POINTER: 7299e82028bSJeremy L Thompson impl->orients_allocated = (bool *)orients; 7309e82028bSJeremy L Thompson impl->orients = impl->orients_allocated; 7319e82028bSJeremy L Thompson break; 7329e82028bSJeremy L Thompson case CEED_USE_POINTER: 7339e82028bSJeremy L Thompson impl->orients = orients; 7349e82028bSJeremy L Thompson } 7359e82028bSJeremy L Thompson } else if (rstr_type == CEED_RESTRICTION_CURL_ORIENTED) { 7369e82028bSJeremy L Thompson CeedCheck(curl_orients != NULL, ceed, CEED_ERROR_BACKEND, "No curl_orients array provided for oriented restriction"); 7379e82028bSJeremy L Thompson switch (copy_mode) { 7389e82028bSJeremy L Thompson case CEED_COPY_VALUES: 7399e82028bSJeremy L Thompson CeedCallBackend(CeedMalloc(3 * num_offsets, &impl->curl_orients_allocated)); 7409e82028bSJeremy L Thompson memcpy(impl->curl_orients_allocated, curl_orients, 3 * num_offsets * sizeof(curl_orients[0])); 7419e82028bSJeremy L Thompson impl->curl_orients = impl->curl_orients_allocated; 7429e82028bSJeremy L Thompson break; 7439e82028bSJeremy L Thompson case CEED_OWN_POINTER: 7449e82028bSJeremy L Thompson impl->curl_orients_allocated = (CeedInt8 *)curl_orients; 7459e82028bSJeremy L Thompson impl->curl_orients = impl->curl_orients_allocated; 7469e82028bSJeremy L Thompson break; 7479e82028bSJeremy L Thompson case CEED_USE_POINTER: 7489e82028bSJeremy L Thompson impl->curl_orients = curl_orients; 7499e82028bSJeremy L Thompson } 7509e82028bSJeremy L Thompson } 7519e82028bSJeremy L Thompson } 7529e82028bSJeremy L Thompson 7539e82028bSJeremy L Thompson // Set apply function 7549e82028bSJeremy L Thompson impl->Apply = CeedElemRestrictionApply_Memcheck_Core; 7559e82028bSJeremy L Thompson 7569e82028bSJeremy L Thompson // Register backend functions 7579e82028bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Apply", CeedElemRestrictionApply_Memcheck)); 7589e82028bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnsigned", CeedElemRestrictionApplyUnsigned_Memcheck)); 7599e82028bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnoriented", CeedElemRestrictionApplyUnoriented_Memcheck)); 7609e82028bSJeremy L Thompson if (rstr_type == CEED_RESTRICTION_POINTS) { 7619e82028bSJeremy L Thompson CeedCallBackend( 7629e82028bSJeremy L Thompson CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyAtPointsInElement", CeedElemRestrictionApplyAtPointsInElement_Memcheck)); 7639e82028bSJeremy L Thompson } 7649e82028bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyBlock", CeedElemRestrictionApplyBlock_Memcheck)); 7659e82028bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOffsets", CeedElemRestrictionGetOffsets_Memcheck)); 7669e82028bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOrientations", CeedElemRestrictionGetOrientations_Memcheck)); 7679e82028bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetCurlOrientations", CeedElemRestrictionGetCurlOrientations_Memcheck)); 7689e82028bSJeremy L Thompson CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Memcheck)); 7699e82028bSJeremy L Thompson return CEED_ERROR_SUCCESS; 7709e82028bSJeremy L Thompson } 7719e82028bSJeremy L Thompson 7729e82028bSJeremy L Thompson //------------------------------------------------------------------------------ 773