| ceed-ref-restriction.c (6ef841ab9f29fd636b2e39e9899cdac3f7a48cc9) | ceed-ref-restriction.c (4baa7aec0dd4cb16a4e44df395cefec334617a2e) |
|---|---|
| 1// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors. 2// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3// 4// SPDX-License-Identifier: BSD-2-Clause 5// 6// This file is part of CEED: http://github.com/ceed 7 8#include <ceed.h> 9#include <ceed/backend.h> 10#include <stdbool.h> 11#include <stdlib.h> 12#include <string.h> 13 14#include "ceed-ref.h" 15 16//------------------------------------------------------------------------------ 17// Core ElemRestriction Apply Code 18//------------------------------------------------------------------------------ 19static inline int CeedElemRestrictionApplyStridedNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, | 1// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors. 2// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3// 4// SPDX-License-Identifier: BSD-2-Clause 5// 6// This file is part of CEED: http://github.com/ceed 7 8#include <ceed.h> 9#include <ceed/backend.h> 10#include <stdbool.h> 11#include <stdlib.h> 12#include <string.h> 13 14#include "ceed-ref.h" 15 16//------------------------------------------------------------------------------ 17// Core ElemRestriction Apply Code 18//------------------------------------------------------------------------------ 19static inline int CeedElemRestrictionApplyStridedNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, |
| 20 CeedInt start, CeedInt stop, CeedInt num_elem, CeedInt elem_size, 21 CeedSize v_offset, const CeedScalar *__restrict__ uu, | 20 const CeedInt start, const CeedInt stop, const CeedInt num_elem, 21 const CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu, |
| 22 CeedScalar *__restrict__ vv) { 23 // No offsets provided, identity restriction 24 bool has_backend_strides; 25 26 CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides)); 27 if (has_backend_strides) { 28 // CPU backend strides are {1, elem_size, elem_size*num_comp} 29 // This if branch is left separate to allow better inlining 30 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { | 22 CeedScalar *__restrict__ vv) { 23 // No offsets provided, identity restriction 24 bool has_backend_strides; 25 26 CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides)); 27 if (has_backend_strides) { 28 // CPU backend strides are {1, elem_size, elem_size*num_comp} 29 // This if branch is left separate to allow better inlining 30 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { |
| 31 CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) { 32 CeedPragmaSIMD for (CeedSize n = 0; n < elem_size; n++) { | 31 for (CeedSize k = 0; k < num_comp; k++) { 32 for (CeedSize n = 0; n < elem_size; n++) { |
| 33 CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) { 34 vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = 35 uu[n + k * elem_size + CeedIntMin(e + j, num_elem - 1) * elem_size * (CeedSize)num_comp]; 36 } 37 } 38 } 39 } 40 } else { 41 // User provided strides 42 CeedInt strides[3]; 43 44 CeedCallBackend(CeedElemRestrictionGetStrides(rstr, strides)); 45 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { | 33 CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) { 34 vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = 35 uu[n + k * elem_size + CeedIntMin(e + j, num_elem - 1) * elem_size * (CeedSize)num_comp]; 36 } 37 } 38 } 39 } 40 } else { 41 // User provided strides 42 CeedInt strides[3]; 43 44 CeedCallBackend(CeedElemRestrictionGetStrides(rstr, strides)); 45 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { |
| 46 CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) { 47 CeedPragmaSIMD for (CeedSize n = 0; n < elem_size; n++) { | 46 for (CeedSize k = 0; k < num_comp; k++) { 47 for (CeedSize n = 0; n < elem_size; n++) { |
| 48 CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) { 49 vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = 50 uu[n * strides[0] + k * strides[1] + CeedIntMin(e + j, num_elem - 1) * (CeedSize)strides[2]]; 51 } 52 } 53 } 54 } 55 } 56 return CEED_ERROR_SUCCESS; 57} 58 59static inline int CeedElemRestrictionApplyOffsetNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, | 48 CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) { 49 vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = 50 uu[n * strides[0] + k * strides[1] + CeedIntMin(e + j, num_elem - 1) * (CeedSize)strides[2]]; 51 } 52 } 53 } 54 } 55 } 56 return CEED_ERROR_SUCCESS; 57} 58 59static inline int CeedElemRestrictionApplyOffsetNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, |
| 60 const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem, 61 CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu, 62 CeedScalar *__restrict__ vv) { | 60 const CeedInt comp_stride, const CeedInt start, const CeedInt stop, 61 const CeedInt num_elem, const CeedInt elem_size, const CeedSize v_offset, 62 const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) { |
| 63 // Default restriction with offsets 64 CeedElemRestriction_Ref *impl; 65 66 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 67 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { | 63 // Default restriction with offsets 64 CeedElemRestriction_Ref *impl; 65 66 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 67 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { |
| 68 CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) { | 68 for (CeedSize k = 0; k < num_comp; k++) { |
| 69 CeedPragmaSIMD for (CeedSize i = 0; i < elem_size * block_size; i++) { 70 vv[elem_size * (k * block_size + e * num_comp) + i - v_offset] = uu[impl->offsets[i + e * elem_size] + k * comp_stride]; 71 } 72 } 73 } 74 return CEED_ERROR_SUCCESS; 75} 76 77static inline int CeedElemRestrictionApplyOrientedNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, | 69 CeedPragmaSIMD for (CeedSize i = 0; i < elem_size * block_size; i++) { 70 vv[elem_size * (k * block_size + e * num_comp) + i - v_offset] = uu[impl->offsets[i + e * elem_size] + k * comp_stride]; 71 } 72 } 73 } 74 return CEED_ERROR_SUCCESS; 75} 76 77static inline int CeedElemRestrictionApplyOrientedNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, |
| 78 const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem, 79 CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu, 80 CeedScalar *__restrict__ vv) { | 78 const CeedInt comp_stride, const CeedInt start, const CeedInt stop, 79 const CeedInt num_elem, const CeedInt elem_size, const CeedSize v_offset, 80 const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) { |
| 81 // Restriction with orientations 82 CeedElemRestriction_Ref *impl; 83 84 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 85 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { | 81 // Restriction with orientations 82 CeedElemRestriction_Ref *impl; 83 84 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 85 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { |
| 86 CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) { | 86 for (CeedSize k = 0; k < num_comp; k++) { |
| 87 CeedPragmaSIMD for (CeedSize i = 0; i < elem_size * block_size; i++) { 88 vv[elem_size * (k * block_size + e * num_comp) + i - v_offset] = 89 uu[impl->offsets[i + e * elem_size] + k * comp_stride] * (impl->orients[i + e * elem_size] ? -1.0 : 1.0); 90 } 91 } 92 } 93 return CEED_ERROR_SUCCESS; 94} 95 96static inline int CeedElemRestrictionApplyCurlOrientedNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, | 87 CeedPragmaSIMD for (CeedSize i = 0; i < elem_size * block_size; i++) { 88 vv[elem_size * (k * block_size + e * num_comp) + i - v_offset] = 89 uu[impl->offsets[i + e * elem_size] + k * comp_stride] * (impl->orients[i + e * elem_size] ? -1.0 : 1.0); 90 } 91 } 92 } 93 return CEED_ERROR_SUCCESS; 94} 95 96static inline int CeedElemRestrictionApplyCurlOrientedNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, |
| 97 const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem, 98 CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu, 99 CeedScalar *__restrict__ vv) { | 97 const CeedInt comp_stride, const CeedInt start, const CeedInt stop, 98 const CeedInt num_elem, const CeedInt elem_size, const CeedSize v_offset, 99 const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) { |
| 100 // Restriction with tridiagonal transformation 101 CeedElemRestriction_Ref *impl; 102 103 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 104 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { | 100 // Restriction with tridiagonal transformation 101 CeedElemRestriction_Ref *impl; 102 103 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 104 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { |
| 105 CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) { | 105 for (CeedSize k = 0; k < num_comp; k++) { |
| 106 CeedSize n = 0; 107 108 CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) { 109 vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = 110 uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] * 111 impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size] + 112 uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] * 113 impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size]; 114 } | 106 CeedSize n = 0; 107 108 CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) { 109 vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = 110 uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] * 111 impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size] + 112 uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] * 113 impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size]; 114 } |
| 115 CeedPragmaSIMD for (n = 1; n < elem_size - 1; n++) { | 115 for (n = 1; n < elem_size - 1; n++) { |
| 116 CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) { 117 vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = 118 uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] * 119 impl->curl_orients[j + (3 * n + 0) * block_size + e * 3 * elem_size] + 120 uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] * 121 impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size] + 122 uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] * 123 impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size]; --- 7 unchanged lines hidden (view full) --- 131 impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]; 132 } 133 } 134 } 135 return CEED_ERROR_SUCCESS; 136} 137 138static inline int CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, | 116 CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) { 117 vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = 118 uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] * 119 impl->curl_orients[j + (3 * n + 0) * block_size + e * 3 * elem_size] + 120 uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] * 121 impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size] + 122 uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] * 123 impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size]; --- 7 unchanged lines hidden (view full) --- 131 impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]; 132 } 133 } 134 } 135 return CEED_ERROR_SUCCESS; 136} 137 138static inline int CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, |
| 139 const CeedInt block_size, const CeedInt comp_stride, CeedInt start, 140 CeedInt stop, CeedInt num_elem, CeedInt elem_size, 141 CeedSize v_offset, const CeedScalar *__restrict__ uu, 142 CeedScalar *__restrict__ vv) { | 139 const CeedInt block_size, const CeedInt comp_stride, 140 const CeedInt start, const CeedInt stop, const CeedInt num_elem, 141 const CeedInt elem_size, const CeedSize v_offset, 142 const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) { |
| 143 // Restriction with (unsigned) tridiagonal transformation 144 CeedElemRestriction_Ref *impl; 145 146 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 147 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { | 143 // Restriction with (unsigned) tridiagonal transformation 144 CeedElemRestriction_Ref *impl; 145 146 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 147 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { |
| 148 CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) { | 148 for (CeedSize k = 0; k < num_comp; k++) { |
| 149 CeedSize n = 0; 150 151 CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) { 152 vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = 153 uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] * 154 abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]) + 155 uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] * 156 abs(impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size]); 157 } | 149 CeedSize n = 0; 150 151 CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) { 152 vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = 153 uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] * 154 abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]) + 155 uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] * 156 abs(impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size]); 157 } |
| 158 CeedPragmaSIMD for (n = 1; n < elem_size - 1; n++) { | 158 for (n = 1; n < elem_size - 1; n++) { |
| 159 CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) { 160 vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = 161 uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] * 162 abs(impl->curl_orients[j + (3 * n + 0) * block_size + e * 3 * elem_size]) + 163 uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] * 164 abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]) + 165 uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] * 166 abs(impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size]); --- 7 unchanged lines hidden (view full) --- 174 abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]); 175 } 176 } 177 } 178 return CEED_ERROR_SUCCESS; 179} 180 181static inline int CeedElemRestrictionApplyStridedTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, | 159 CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) { 160 vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] = 161 uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] * 162 abs(impl->curl_orients[j + (3 * n + 0) * block_size + e * 3 * elem_size]) + 163 uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] * 164 abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]) + 165 uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] * 166 abs(impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size]); --- 7 unchanged lines hidden (view full) --- 174 abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]); 175 } 176 } 177 } 178 return CEED_ERROR_SUCCESS; 179} 180 181static inline int CeedElemRestrictionApplyStridedTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, |
| 182 CeedInt start, CeedInt stop, CeedInt num_elem, CeedInt elem_size, 183 CeedSize v_offset, const CeedScalar *__restrict__ uu, 184 CeedScalar *__restrict__ vv) { | 182 const CeedInt start, const CeedInt stop, const CeedInt num_elem, 183 const CeedInt elem_size, const CeedSize v_offset, 184 const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) { |
| 185 // No offsets provided, identity restriction 186 bool has_backend_strides; 187 188 CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides)); 189 if (has_backend_strides) { 190 // CPU backend strides are {1, elem_size, elem_size*num_comp} 191 // This if brach is left separate to allow better inlining 192 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { | 185 // No offsets provided, identity restriction 186 bool has_backend_strides; 187 188 CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides)); 189 if (has_backend_strides) { 190 // CPU backend strides are {1, elem_size, elem_size*num_comp} 191 // This if brach is left separate to allow better inlining 192 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { |
| 193 CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) { 194 CeedPragmaSIMD for (CeedSize n = 0; n < elem_size; n++) { | 193 for (CeedSize k = 0; k < num_comp; k++) { 194 for (CeedSize n = 0; n < elem_size; n++) { |
| 195 CeedPragmaSIMD for (CeedSize j = 0; j < CeedIntMin(block_size, num_elem - e); j++) { 196 vv[n + k * elem_size + (e + j) * elem_size * num_comp] += uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset]; 197 } 198 } 199 } 200 } 201 } else { 202 // User provided strides 203 CeedInt strides[3]; 204 205 CeedCallBackend(CeedElemRestrictionGetStrides(rstr, strides)); 206 for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) { | 195 CeedPragmaSIMD for (CeedSize j = 0; j < CeedIntMin(block_size, num_elem - e); j++) { 196 vv[n + k * elem_size + (e + j) * elem_size * num_comp] += uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset]; 197 } 198 } 199 } 200 } 201 } else { 202 // User provided strides 203 CeedInt strides[3]; 204 205 CeedCallBackend(CeedElemRestrictionGetStrides(rstr, strides)); 206 for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) { |
| 207 CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) { 208 CeedPragmaSIMD for (CeedSize n = 0; n < elem_size; n++) { | 207 for (CeedSize k = 0; k < num_comp; k++) { 208 for (CeedSize n = 0; n < elem_size; n++) { |
| 209 CeedPragmaSIMD for (CeedSize j = 0; j < CeedIntMin(block_size, num_elem - e); j++) { 210 vv[n * strides[0] + k * strides[1] + (e + j) * strides[2]] += 211 uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset]; 212 } 213 } 214 } 215 } 216 } 217 return CEED_ERROR_SUCCESS; 218} 219 220static inline int CeedElemRestrictionApplyOffsetTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, | 209 CeedPragmaSIMD for (CeedSize j = 0; j < CeedIntMin(block_size, num_elem - e); j++) { 210 vv[n * strides[0] + k * strides[1] + (e + j) * strides[2]] += 211 uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset]; 212 } 213 } 214 } 215 } 216 } 217 return CEED_ERROR_SUCCESS; 218} 219 220static inline int CeedElemRestrictionApplyOffsetTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, |
| 221 const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem, 222 CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu, 223 CeedScalar *__restrict__ vv) { | 221 const CeedInt comp_stride, const CeedInt start, const CeedInt stop, 222 const CeedInt num_elem, const CeedInt elem_size, const CeedSize v_offset, 223 const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) { |
| 224 // Default restriction with offsets 225 CeedElemRestriction_Ref *impl; 226 227 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 228 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { 229 for (CeedSize k = 0; k < num_comp; k++) { 230 for (CeedSize i = 0; i < elem_size * block_size; i += block_size) { 231 // Iteration bound set to discard padding elements --- 5 unchanged lines hidden (view full) --- 237 } 238 } 239 } 240 } 241 return CEED_ERROR_SUCCESS; 242} 243 244static inline int CeedElemRestrictionApplyOrientedTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, | 224 // Default restriction with offsets 225 CeedElemRestriction_Ref *impl; 226 227 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 228 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { 229 for (CeedSize k = 0; k < num_comp; k++) { 230 for (CeedSize i = 0; i < elem_size * block_size; i += block_size) { 231 // Iteration bound set to discard padding elements --- 5 unchanged lines hidden (view full) --- 237 } 238 } 239 } 240 } 241 return CEED_ERROR_SUCCESS; 242} 243 244static inline int CeedElemRestrictionApplyOrientedTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, |
| 245 const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem, 246 CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu, 247 CeedScalar *__restrict__ vv) { | 245 const CeedInt comp_stride, const CeedInt start, const CeedInt stop, 246 const CeedInt num_elem, const CeedInt elem_size, const CeedSize v_offset, 247 const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) { |
| 248 // Restriction with orientations 249 CeedElemRestriction_Ref *impl; 250 251 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 252 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { 253 for (CeedSize k = 0; k < num_comp; k++) { 254 for (CeedSize i = 0; i < elem_size * block_size; i += block_size) { 255 // Iteration bound set to discard padding elements --- 5 unchanged lines hidden (view full) --- 261 } 262 } 263 } 264 } 265 return CEED_ERROR_SUCCESS; 266} 267 268static inline int CeedElemRestrictionApplyCurlOrientedTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, | 248 // Restriction with orientations 249 CeedElemRestriction_Ref *impl; 250 251 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 252 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { 253 for (CeedSize k = 0; k < num_comp; k++) { 254 for (CeedSize i = 0; i < elem_size * block_size; i += block_size) { 255 // Iteration bound set to discard padding elements --- 5 unchanged lines hidden (view full) --- 261 } 262 } 263 } 264 } 265 return CEED_ERROR_SUCCESS; 266} 267 268static inline int CeedElemRestrictionApplyCurlOrientedTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, |
| 269 const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem, 270 CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu, 271 CeedScalar *__restrict__ vv) { | 269 const CeedInt comp_stride, const CeedInt start, const CeedInt stop, 270 const CeedInt num_elem, const CeedInt elem_size, const CeedSize v_offset, 271 const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) { |
| 272 // Restriction with tridiagonal transformation 273 CeedElemRestriction_Ref *impl; 274 CeedScalar vv_loc[block_size]; 275 276 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 277 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { 278 for (CeedSize k = 0; k < num_comp; k++) { 279 // Iteration bound set to discard padding elements --- 32 unchanged lines hidden (view full) --- 312 CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; 313 } 314 } 315 } 316 return CEED_ERROR_SUCCESS; 317} 318 319static inline int CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, | 272 // Restriction with tridiagonal transformation 273 CeedElemRestriction_Ref *impl; 274 CeedScalar vv_loc[block_size]; 275 276 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 277 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { 278 for (CeedSize k = 0; k < num_comp; k++) { 279 // Iteration bound set to discard padding elements --- 32 unchanged lines hidden (view full) --- 312 CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; 313 } 314 } 315 } 316 return CEED_ERROR_SUCCESS; 317} 318 319static inline int CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, |
| 320 const CeedInt block_size, const CeedInt comp_stride, CeedInt start, 321 CeedInt stop, CeedInt num_elem, CeedInt elem_size, CeedSize v_offset, | 320 const CeedInt block_size, const CeedInt comp_stride, 321 const CeedInt start, const CeedInt stop, const CeedInt num_elem, 322 const CeedInt elem_size, const CeedSize v_offset, |
| 322 const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) { 323 // Restriction with (unsigned) tridiagonal transformation 324 CeedElemRestriction_Ref *impl; 325 CeedScalar vv_loc[block_size]; 326 327 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 328 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { 329 for (CeedSize k = 0; k < num_comp; k++) { --- 32 unchanged lines hidden (view full) --- 362 for (CeedSize j = 0; j < block_end; j++) { 363 CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; 364 } 365 } 366 } 367 return CEED_ERROR_SUCCESS; 368} 369 | 323 const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) { 324 // Restriction with (unsigned) tridiagonal transformation 325 CeedElemRestriction_Ref *impl; 326 CeedScalar vv_loc[block_size]; 327 328 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 329 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) { 330 for (CeedSize k = 0; k < num_comp; k++) { --- 32 unchanged lines hidden (view full) --- 363 for (CeedSize j = 0; j < block_end; j++) { 364 CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j]; 365 } 366 } 367 } 368 return CEED_ERROR_SUCCESS; 369} 370 |
| 370static inline int CeedElemRestrictionApplyAtPointsInElement_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, CeedInt start, CeedInt stop, 371 CeedTransposeMode t_mode, const CeedScalar *__restrict__ uu, | 371static inline int CeedElemRestrictionApplyAtPointsInElement_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt start, 372 const CeedInt stop, CeedTransposeMode t_mode, const CeedScalar *__restrict__ uu, |
| 372 CeedScalar *__restrict__ vv) { 373 CeedInt num_points, l_vec_offset; 374 CeedSize e_vec_offset = 0; 375 CeedElemRestriction_Ref *impl; 376 377 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 378 for (CeedInt e = start; e < stop; e++) { 379 l_vec_offset = impl->offsets[e]; --- 8 unchanged lines hidden (view full) --- 388 } 389 } 390 e_vec_offset += num_points * (CeedSize)num_comp; 391 } 392 return CEED_ERROR_SUCCESS; 393} 394 395static inline int CeedElemRestrictionApply_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, | 373 CeedScalar *__restrict__ vv) { 374 CeedInt num_points, l_vec_offset; 375 CeedSize e_vec_offset = 0; 376 CeedElemRestriction_Ref *impl; 377 378 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl)); 379 for (CeedInt e = start; e < stop; e++) { 380 l_vec_offset = impl->offsets[e]; --- 8 unchanged lines hidden (view full) --- 389 } 390 } 391 e_vec_offset += num_points * (CeedSize)num_comp; 392 } 393 return CEED_ERROR_SUCCESS; 394} 395 396static inline int CeedElemRestrictionApply_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, |
| 396 const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs, 397 bool use_orients, CeedVector u, CeedVector v, CeedRequest *request) { | 397 const CeedInt comp_stride, const CeedInt start, const CeedInt stop, CeedTransposeMode t_mode, 398 bool use_signs, bool use_orients, CeedVector u, CeedVector v, CeedRequest *request) { |
| 398 CeedInt num_elem, elem_size; 399 CeedSize v_offset = 0; 400 CeedRestrictionType rstr_type; 401 const CeedScalar *uu; 402 CeedScalar *vv; 403 404 CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); 405 CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); --- 506 unchanged lines hidden --- | 399 CeedInt num_elem, elem_size; 400 CeedSize v_offset = 0; 401 CeedRestrictionType rstr_type; 402 const CeedScalar *uu; 403 CeedScalar *vv; 404 405 CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem)); 406 CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size)); --- 506 unchanged lines hidden --- |