ceed-ref-restriction.c - OpenGrok cross reference for /libCEED/backends/ref/ceed-ref-restriction.c

1// Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
2// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3//
4// SPDX-License-Identifier: BSD-2-Clause
5//
6// This file is part of CEED: http://github.com/ceed
7
8#include <ceed.h>
9#include <ceed/backend.h>
10#include <stdbool.h>
11#include <stdlib.h>
12#include <string.h>
13
14#include "ceed-ref.h"
15
16//------------------------------------------------------------------------------
17// Core ElemRestriction Apply Code
18//------------------------------------------------------------------------------
19static inline int CeedElemRestrictionApplyStridedNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,

20 CeedInt start, CeedInt stop, CeedInt num_elem, CeedInt elem_size,
21 CeedSize v_offset, const CeedScalar *__restrict__ uu,

20 const CeedInt start, const CeedInt stop, const CeedInt num_elem,
21 const CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu,

22 CeedScalar *__restrict__ vv) {
23 // No offsets provided, identity restriction
24 bool has_backend_strides;
25
26 CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides));
27 if (has_backend_strides) {
28 // CPU backend strides are {1, elem_size, elem_size*num_comp}
29 // This if branch is left separate to allow better inlining
30 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {

31 CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
32 CeedPragmaSIMD for (CeedSize n = 0; n < elem_size; n++) {

31 for (CeedSize k = 0; k < num_comp; k++) {
32 for (CeedSize n = 0; n < elem_size; n++) {

33 CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
34 vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
35 uu[n + k * elem_size + CeedIntMin(e + j, num_elem - 1) * elem_size * (CeedSize)num_comp];
36 }
37 }
38 }
39 }
40 } else {
41 // User provided strides
42 CeedInt strides[3];
43
44 CeedCallBackend(CeedElemRestrictionGetStrides(rstr, strides));
45 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {

46 CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
47 CeedPragmaSIMD for (CeedSize n = 0; n < elem_size; n++) {

46 for (CeedSize k = 0; k < num_comp; k++) {
47 for (CeedSize n = 0; n < elem_size; n++) {

48 CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
49 vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
50 uu[n * strides[0] + k * strides[1] + CeedIntMin(e + j, num_elem - 1) * (CeedSize)strides[2]];
51 }
52 }
53 }
54 }
55 }
56 return CEED_ERROR_SUCCESS;
57}
58
59static inline int CeedElemRestrictionApplyOffsetNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,

60 const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem,
61 CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu,
62 CeedScalar *__restrict__ vv) {

60 const CeedInt comp_stride, const CeedInt start, const CeedInt stop,
61 const CeedInt num_elem, const CeedInt elem_size, const CeedSize v_offset,
62 const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {

63 // Default restriction with offsets
64 CeedElemRestriction_Ref *impl;
65
66 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
67 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {

68 CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {

68 for (CeedSize k = 0; k < num_comp; k++) {

69 CeedPragmaSIMD for (CeedSize i = 0; i < elem_size * block_size; i++) {
70 vv[elem_size * (k * block_size + e * num_comp) + i - v_offset] = uu[impl->offsets[i + e * elem_size] + k * comp_stride];
71 }
72 }
73 }
74 return CEED_ERROR_SUCCESS;
75}
76
77static inline int CeedElemRestrictionApplyOrientedNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,

78 const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem,
79 CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu,
80 CeedScalar *__restrict__ vv) {

78 const CeedInt comp_stride, const CeedInt start, const CeedInt stop,
79 const CeedInt num_elem, const CeedInt elem_size, const CeedSize v_offset,
80 const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {

81 // Restriction with orientations
82 CeedElemRestriction_Ref *impl;
83
84 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
85 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {

86 CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {

86 for (CeedSize k = 0; k < num_comp; k++) {

87 CeedPragmaSIMD for (CeedSize i = 0; i < elem_size * block_size; i++) {
88 vv[elem_size * (k * block_size + e * num_comp) + i - v_offset] =
89 uu[impl->offsets[i + e * elem_size] + k * comp_stride] * (impl->orients[i + e * elem_size] ? -1.0 : 1.0);
90 }
91 }
92 }
93 return CEED_ERROR_SUCCESS;
94}
95
96static inline int CeedElemRestrictionApplyCurlOrientedNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,

97 const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem,
98 CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu,
99 CeedScalar *__restrict__ vv) {

97 const CeedInt comp_stride, const CeedInt start, const CeedInt stop,
98 const CeedInt num_elem, const CeedInt elem_size, const CeedSize v_offset,
99 const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {

100 // Restriction with tridiagonal transformation
101 CeedElemRestriction_Ref *impl;
102
103 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
104 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {

105 CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {

105 for (CeedSize k = 0; k < num_comp; k++) {

106 CeedSize n = 0;
107
108 CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
109 vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
110 uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] *
111 impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size] +
112 uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] *
113 impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size];
114 }

115 CeedPragmaSIMD for (n = 1; n < elem_size - 1; n++) {

115 for (n = 1; n < elem_size - 1; n++) {

116 CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
117 vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
118 uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] *
119 impl->curl_orients[j + (3 * n + 0) * block_size + e * 3 * elem_size] +
120 uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] *
121 impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size] +
122 uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] *
123 impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size];

--- 7 unchanged lines hidden (view full) ---

131 impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size];
132 }
133 }
134 }
135 return CEED_ERROR_SUCCESS;
136}
137
138static inline int CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp,

139 const CeedInt block_size, const CeedInt comp_stride, CeedInt start,
140 CeedInt stop, CeedInt num_elem, CeedInt elem_size,
141 CeedSize v_offset, const CeedScalar *__restrict__ uu,
142 CeedScalar *__restrict__ vv) {

139 const CeedInt block_size, const CeedInt comp_stride,
140 const CeedInt start, const CeedInt stop, const CeedInt num_elem,
141 const CeedInt elem_size, const CeedSize v_offset,
142 const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {

143 // Restriction with (unsigned) tridiagonal transformation
144 CeedElemRestriction_Ref *impl;
145
146 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
147 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {

148 CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {

148 for (CeedSize k = 0; k < num_comp; k++) {

149 CeedSize n = 0;
150
151 CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
152 vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
153 uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] *
154 abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]) +
155 uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] *
156 abs(impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size]);
157 }

158 CeedPragmaSIMD for (n = 1; n < elem_size - 1; n++) {

158 for (n = 1; n < elem_size - 1; n++) {

159 CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
160 vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
161 uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] *
162 abs(impl->curl_orients[j + (3 * n + 0) * block_size + e * 3 * elem_size]) +
163 uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] *
164 abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]) +
165 uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] *
166 abs(impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size]);

--- 7 unchanged lines hidden (view full) ---

174 abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]);
175 }
176 }
177 }
178 return CEED_ERROR_SUCCESS;
179}
180
181static inline int CeedElemRestrictionApplyStridedTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,

182 CeedInt start, CeedInt stop, CeedInt num_elem, CeedInt elem_size,
183 CeedSize v_offset, const CeedScalar *__restrict__ uu,
184 CeedScalar *__restrict__ vv) {

182 const CeedInt start, const CeedInt stop, const CeedInt num_elem,
183 const CeedInt elem_size, const CeedSize v_offset,
184 const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {

185 // No offsets provided, identity restriction
186 bool has_backend_strides;
187
188 CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides));
189 if (has_backend_strides) {
190 // CPU backend strides are {1, elem_size, elem_size*num_comp}
191 // This if brach is left separate to allow better inlining
192 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {

193 CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
194 CeedPragmaSIMD for (CeedSize n = 0; n < elem_size; n++) {

193 for (CeedSize k = 0; k < num_comp; k++) {
194 for (CeedSize n = 0; n < elem_size; n++) {

195 CeedPragmaSIMD for (CeedSize j = 0; j < CeedIntMin(block_size, num_elem - e); j++) {
196 vv[n + k * elem_size + (e + j) * elem_size * num_comp] += uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset];
197 }
198 }
199 }
200 }
201 } else {
202 // User provided strides
203 CeedInt strides[3];
204
205 CeedCallBackend(CeedElemRestrictionGetStrides(rstr, strides));
206 for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) {

207 CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
208 CeedPragmaSIMD for (CeedSize n = 0; n < elem_size; n++) {

207 for (CeedSize k = 0; k < num_comp; k++) {
208 for (CeedSize n = 0; n < elem_size; n++) {

209 CeedPragmaSIMD for (CeedSize j = 0; j < CeedIntMin(block_size, num_elem - e); j++) {
210 vv[n * strides[0] + k * strides[1] + (e + j) * strides[2]] +=
211 uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset];
212 }
213 }
214 }
215 }
216 }
217 return CEED_ERROR_SUCCESS;
218}
219
220static inline int CeedElemRestrictionApplyOffsetTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,

221 const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem,
222 CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu,
223 CeedScalar *__restrict__ vv) {

221 const CeedInt comp_stride, const CeedInt start, const CeedInt stop,
222 const CeedInt num_elem, const CeedInt elem_size, const CeedSize v_offset,
223 const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {

224 // Default restriction with offsets
225 CeedElemRestriction_Ref *impl;
226
227 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
228 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
229 for (CeedSize k = 0; k < num_comp; k++) {
230 for (CeedSize i = 0; i < elem_size * block_size; i += block_size) {
231 // Iteration bound set to discard padding elements

--- 5 unchanged lines hidden (view full) ---

237 }
238 }
239 }
240 }
241 return CEED_ERROR_SUCCESS;
242}
243
244static inline int CeedElemRestrictionApplyOrientedTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,

245 const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem,
246 CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu,
247 CeedScalar *__restrict__ vv) {

245 const CeedInt comp_stride, const CeedInt start, const CeedInt stop,
246 const CeedInt num_elem, const CeedInt elem_size, const CeedSize v_offset,
247 const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {

248 // Restriction with orientations
249 CeedElemRestriction_Ref *impl;
250
251 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
252 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
253 for (CeedSize k = 0; k < num_comp; k++) {
254 for (CeedSize i = 0; i < elem_size * block_size; i += block_size) {
255 // Iteration bound set to discard padding elements

--- 5 unchanged lines hidden (view full) ---

261 }
262 }
263 }
264 }
265 return CEED_ERROR_SUCCESS;
266}
267
268static inline int CeedElemRestrictionApplyCurlOrientedTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,

269 const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem,
270 CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu,
271 CeedScalar *__restrict__ vv) {

269 const CeedInt comp_stride, const CeedInt start, const CeedInt stop,
270 const CeedInt num_elem, const CeedInt elem_size, const CeedSize v_offset,
271 const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {

272 // Restriction with tridiagonal transformation
273 CeedElemRestriction_Ref *impl;
274 CeedScalar vv_loc[block_size];
275
276 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
277 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
278 for (CeedSize k = 0; k < num_comp; k++) {
279 // Iteration bound set to discard padding elements

--- 32 unchanged lines hidden (view full) ---

312 CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j];
313 }
314 }
315 }
316 return CEED_ERROR_SUCCESS;
317}
318
319static inline int CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp,

320 const CeedInt block_size, const CeedInt comp_stride, CeedInt start,
321 CeedInt stop, CeedInt num_elem, CeedInt elem_size, CeedSize v_offset,

320 const CeedInt block_size, const CeedInt comp_stride,
321 const CeedInt start, const CeedInt stop, const CeedInt num_elem,
322 const CeedInt elem_size, const CeedSize v_offset,

322 const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
323 // Restriction with (unsigned) tridiagonal transformation
324 CeedElemRestriction_Ref *impl;
325 CeedScalar vv_loc[block_size];
326
327 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
328 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
329 for (CeedSize k = 0; k < num_comp; k++) {

--- 32 unchanged lines hidden (view full) ---

362 for (CeedSize j = 0; j < block_end; j++) {
363 CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j];
364 }
365 }
366 }
367 return CEED_ERROR_SUCCESS;
368}
369

323 const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
324 // Restriction with (unsigned) tridiagonal transformation
325 CeedElemRestriction_Ref *impl;
326 CeedScalar vv_loc[block_size];
327
328 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
329 for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
330 for (CeedSize k = 0; k < num_comp; k++) {

--- 32 unchanged lines hidden (view full) ---

363 for (CeedSize j = 0; j < block_end; j++) {
364 CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j];
365 }
366 }
367 }
368 return CEED_ERROR_SUCCESS;
369}
370

370static inline int CeedElemRestrictionApplyAtPointsInElement_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, CeedInt start, CeedInt stop,
371 CeedTransposeMode t_mode, const CeedScalar *__restrict__ uu,

371static inline int CeedElemRestrictionApplyAtPointsInElement_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt start,
372 const CeedInt stop, CeedTransposeMode t_mode, const CeedScalar *__restrict__ uu,

372 CeedScalar *__restrict__ vv) {
373 CeedInt num_points, l_vec_offset;
374 CeedSize e_vec_offset = 0;
375 CeedElemRestriction_Ref *impl;
376
377 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
378 for (CeedInt e = start; e < stop; e++) {
379 l_vec_offset = impl->offsets[e];

--- 8 unchanged lines hidden (view full) ---

388 }
389 }
390 e_vec_offset += num_points * (CeedSize)num_comp;
391 }
392 return CEED_ERROR_SUCCESS;
393}
394
395static inline int CeedElemRestrictionApply_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,

373 CeedScalar *__restrict__ vv) {
374 CeedInt num_points, l_vec_offset;
375 CeedSize e_vec_offset = 0;
376 CeedElemRestriction_Ref *impl;
377
378 CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
379 for (CeedInt e = start; e < stop; e++) {
380 l_vec_offset = impl->offsets[e];

--- 8 unchanged lines hidden (view full) ---

389 }
390 }
391 e_vec_offset += num_points * (CeedSize)num_comp;
392 }
393 return CEED_ERROR_SUCCESS;
394}
395
396static inline int CeedElemRestrictionApply_Ref_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,

396 const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedTransposeMode t_mode, bool use_signs,
397 bool use_orients, CeedVector u, CeedVector v, CeedRequest *request) {

397 const CeedInt comp_stride, const CeedInt start, const CeedInt stop, CeedTransposeMode t_mode,
398 bool use_signs, bool use_orients, CeedVector u, CeedVector v, CeedRequest *request) {