xref: /libCEED/rust/libceed-sys/c-src/backends/memcheck/ceed-memcheck-restriction.c (revision 740363ccb36551f3efa3d1612e26dcfe14a20099)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <ceed/backend.h>
10 #include <stdbool.h>
11 #include <stdlib.h>
12 #include <string.h>
13 
14 #include "ceed-memcheck.h"
15 
16 //------------------------------------------------------------------------------
17 // Set backend strides
18 //------------------------------------------------------------------------------
19 static inline int CeedElemRestrictionGetBackendStrides_Memcheck(CeedElemRestriction rstr, CeedInt strides[3]) {
20   CeedInt elem_size, num_comp, num_elem;
21 
22   CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
23   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
24   CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem));
25   // Memcheck default, contiguous by component, then node
26   strides[0] = num_comp;
27   strides[1] = 1;
28   strides[2] = num_comp * elem_size;
29   /**
30       // CPU default, contiguous by node, then component
31       strides[0] = 1;
32       strides[1] = elem_size;
33       strides[2] = elem_size * num_comp;
34 
35       // GPU default, contiguous by node, then element
36       strides[0] = 1;
37       strides[1] = num_elem * elem_size;
38       strides[2] = elem_size;
39      **/
40   return CEED_ERROR_SUCCESS;
41 }
42 
43 //------------------------------------------------------------------------------
44 // Core ElemRestriction Apply Code
45 //------------------------------------------------------------------------------
46 static inline int CeedElemRestrictionApplyStridedNoTranspose_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
47                                                                            CeedInt start, CeedInt stop, CeedInt num_elem, CeedInt elem_size,
48                                                                            CeedSize v_offset, const CeedScalar *__restrict__ uu,
49                                                                            CeedScalar *__restrict__ vv) {
50   // Get strides
51   bool    has_backend_strides;
52   CeedInt strides[3] = {0};
53 
54   CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides));
55   if (has_backend_strides) CeedCallBackend(CeedElemRestrictionGetBackendStrides_Memcheck(rstr, strides));
56   else CeedCallBackend(CeedElemRestrictionGetStrides(rstr, strides));
57 
58   // Apply restriction
59   for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) {
60     CeedPragmaSIMD for (CeedInt k = 0; k < num_comp; k++) {
61       CeedPragmaSIMD for (CeedInt n = 0; n < elem_size; n++) {
62         CeedPragmaSIMD for (CeedInt j = 0; j < block_size; j++) {
63           vv[e * elem_size * (CeedSize)num_comp + (k * (CeedSize)elem_size + n) * block_size + j - v_offset] =
64               uu[n * strides[0] + k * strides[1] + CeedIntMin(e + j, num_elem - 1) * strides[2]];
65         }
66       }
67     }
68   }
69   return CEED_ERROR_SUCCESS;
70 }
71 
72 static inline int CeedElemRestrictionApplyOffsetNoTranspose_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
73                                                                           const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem,
74                                                                           CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu,
75                                                                           CeedScalar *__restrict__ vv) {
76   // Default restriction with offsets
77   CeedElemRestriction_Memcheck *impl;
78 
79   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
80   for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) {
81     CeedPragmaSIMD for (CeedInt k = 0; k < num_comp; k++) {
82       CeedPragmaSIMD for (CeedInt i = 0; i < elem_size * block_size; i++) {
83         vv[elem_size * (k * (CeedSize)block_size + e * (CeedSize)num_comp) + i - v_offset] = uu[impl->offsets[i + e * elem_size] + k * comp_stride];
84       }
85     }
86   }
87   return CEED_ERROR_SUCCESS;
88 }
89 
90 static inline int CeedElemRestrictionApplyOrientedNoTranspose_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp,
91                                                                             const CeedInt block_size, const CeedInt comp_stride, CeedInt start,
92                                                                             CeedInt stop, CeedInt num_elem, CeedInt elem_size, CeedSize v_offset,
93                                                                             const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
94   // Restriction with orientations
95   CeedElemRestriction_Memcheck *impl;
96 
97   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
98   for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) {
99     CeedPragmaSIMD for (CeedInt k = 0; k < num_comp; k++) {
100       CeedPragmaSIMD for (CeedInt i = 0; i < elem_size * block_size; i++) {
101         vv[elem_size * (k * (CeedSize)block_size + e * (CeedSize)num_comp) + i - v_offset] =
102             uu[impl->offsets[i + e * elem_size] + k * comp_stride] * (impl->orients[i + e * elem_size] ? -1.0 : 1.0);
103       }
104     }
105   }
106   return CEED_ERROR_SUCCESS;
107 }
108 
// Curl-oriented no-transpose restriction: gather through offsets, then apply a
// per-element tridiagonal transformation stored in curl_orients:
//   v_n = o_(3n) * u_(n-1) + o_(3n+1) * u_n + o_(3n+2) * u_(n+1)
// The first (n == 0) and last (n == elem_size - 1) rows have only two nonzero
// entries and are handled outside the main loop.
static inline int CeedElemRestrictionApplyCurlOrientedNoTranspose_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp,
                                                                                const CeedInt block_size, const CeedInt comp_stride, CeedInt start,
                                                                                CeedInt stop, CeedInt num_elem, CeedInt elem_size, CeedSize v_offset,
                                                                                const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
  // Restriction with tridiagonal transformation
  CeedElemRestriction_Memcheck *impl;

  CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
  for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) {
    CeedPragmaSIMD for (CeedInt k = 0; k < num_comp; k++) {
      CeedInt n = 0;

      // First row: diagonal and superdiagonal terms only
      CeedPragmaSIMD for (CeedInt j = 0; j < block_size; j++) {
        vv[e * elem_size * (CeedSize)num_comp + (k * (CeedSize)elem_size + n) * block_size + j - v_offset] =
            uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] *
                impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size] +
            uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] *
                impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size];
      }
      // Interior rows: subdiagonal, diagonal, and superdiagonal terms
      CeedPragmaSIMD for (n = 1; n < elem_size - 1; n++) {
        CeedPragmaSIMD for (CeedInt j = 0; j < block_size; j++) {
          vv[e * elem_size * (CeedSize)num_comp + (k * (CeedSize)elem_size + n) * block_size + j - v_offset] =
              uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] *
                  impl->curl_orients[j + (3 * n + 0) * block_size + e * 3 * elem_size] +
              uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] *
                  impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size] +
              uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] *
                  impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size];
        }
      }
      // Last row (n == elem_size - 1 after the loop): subdiagonal and diagonal terms only
      CeedPragmaSIMD for (CeedInt j = 0; j < block_size; j++) {
        vv[e * elem_size * (CeedSize)num_comp + (k * (CeedSize)elem_size + n) * block_size + j - v_offset] =
            uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] *
                impl->curl_orients[j + (3 * n + 0) * block_size + e * 3 * elem_size] +
            uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] *
                impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size];
      }
    }
  }
  return CEED_ERROR_SUCCESS;
}
150 
// Unsigned variant of the curl-oriented no-transpose restriction: identical
// tridiagonal structure, but abs() drops the sign of each curl_orients entry so
// only the sparsity pattern (0/1 magnitudes) is applied.
static inline int CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Memcheck_Core(
    CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride, CeedInt start, CeedInt stop,
    CeedInt num_elem, CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
  // Restriction with (unsigned) tridiagonal transformation
  CeedElemRestriction_Memcheck *impl;

  CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
  for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) {
    CeedPragmaSIMD for (CeedInt k = 0; k < num_comp; k++) {
      CeedInt n = 0;

      // First row: diagonal and superdiagonal terms only
      CeedPragmaSIMD for (CeedInt j = 0; j < block_size; j++) {
        vv[e * elem_size * (CeedSize)num_comp + (k * (CeedSize)elem_size + n) * block_size + j - v_offset] =
            uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] *
                abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]) +
            uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] *
                abs(impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size]);
      }
      // Interior rows: subdiagonal, diagonal, and superdiagonal terms
      CeedPragmaSIMD for (n = 1; n < elem_size - 1; n++) {
        CeedPragmaSIMD for (CeedInt j = 0; j < block_size; j++) {
          vv[e * elem_size * (CeedSize)num_comp + (k * (CeedSize)elem_size + n) * block_size + j - v_offset] =
              uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] *
                  abs(impl->curl_orients[j + (3 * n + 0) * block_size + e * 3 * elem_size]) +
              uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] *
                  abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]) +
              uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] *
                  abs(impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size]);
        }
      }
      // Last row (n == elem_size - 1 after the loop): subdiagonal and diagonal terms only
      CeedPragmaSIMD for (CeedInt j = 0; j < block_size; j++) {
        vv[e * elem_size * (CeedSize)num_comp + (k * (CeedSize)elem_size + n) * block_size + j - v_offset] =
            uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] *
                abs(impl->curl_orients[j + (3 * n + 0) * block_size + e * 3 * elem_size]) +
            uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] *
                abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]);
      }
    }
  }
  return CEED_ERROR_SUCCESS;
}
191 
192 static inline int CeedElemRestrictionApplyStridedTranspose_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
193                                                                          CeedInt start, CeedInt stop, CeedInt num_elem, CeedInt elem_size,
194                                                                          CeedSize v_offset, const CeedScalar *__restrict__ uu,
195                                                                          CeedScalar *__restrict__ vv) {
196   // Get strides
197   bool    has_backend_strides;
198   CeedInt strides[3] = {0};
199 
200   CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides));
201   if (has_backend_strides) CeedCallBackend(CeedElemRestrictionGetBackendStrides_Memcheck(rstr, strides));
202   else CeedCallBackend(CeedElemRestrictionGetStrides(rstr, strides));
203 
204   // Apply restriction
205   for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) {
206     CeedPragmaSIMD for (CeedInt k = 0; k < num_comp; k++) {
207       CeedPragmaSIMD for (CeedInt n = 0; n < elem_size; n++) {
208         CeedPragmaSIMD for (CeedInt j = 0; j < CeedIntMin(block_size, num_elem - e); j++) {
209           vv[n * (CeedSize)strides[0] + k * (CeedSize)strides[1] + (e + j) * (CeedSize)strides[2]] +=
210               uu[e * elem_size * (CeedSize)num_comp + (k * (CeedSize)elem_size + n) * block_size + j - v_offset];
211         }
212       }
213     }
214   }
215   return CEED_ERROR_SUCCESS;
216 }
217 
218 static inline int CeedElemRestrictionApplyOffsetTranspose_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
219                                                                         const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem,
220                                                                         CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu,
221                                                                         CeedScalar *__restrict__ vv) {
222   // Default restriction with offsets
223   CeedElemRestriction_Memcheck *impl;
224 
225   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
226   for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) {
227     for (CeedInt k = 0; k < num_comp; k++) {
228       for (CeedInt i = 0; i < elem_size * block_size; i += block_size) {
229         // Iteration bound set to discard padding elements
230         for (CeedInt j = i; j < i + CeedIntMin(block_size, num_elem - e); j++) {
231           CeedScalar vv_loc;
232 
233           vv_loc = uu[elem_size * (k * (CeedSize)block_size + e * (CeedSize)num_comp) + j - v_offset];
234           CeedPragmaAtomic vv[impl->offsets[j + e * elem_size] + k * comp_stride] += vv_loc;
235         }
236       }
237     }
238   }
239   return CEED_ERROR_SUCCESS;
240 }
241 
242 static inline int CeedElemRestrictionApplyOrientedTranspose_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
243                                                                           const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem,
244                                                                           CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu,
245                                                                           CeedScalar *__restrict__ vv) {
246   // Restriction with orientations
247   CeedElemRestriction_Memcheck *impl;
248 
249   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
250   for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) {
251     for (CeedInt k = 0; k < num_comp; k++) {
252       for (CeedInt i = 0; i < elem_size * block_size; i += block_size) {
253         // Iteration bound set to discard padding elements
254         for (CeedInt j = i; j < i + CeedIntMin(block_size, num_elem - e); j++) {
255           CeedScalar vv_loc;
256 
257           vv_loc =
258               uu[elem_size * (k * (CeedSize)block_size + e * (CeedSize)num_comp) + j - v_offset] * (impl->orients[j + e * elem_size] ? -1.0 : 1.0);
259           CeedPragmaAtomic vv[impl->offsets[j + e * elem_size] + k * comp_stride] += vv_loc;
260         }
261       }
262     }
263   }
264   return CEED_ERROR_SUCCESS;
265 }
266 
// Curl-oriented transpose restriction: apply the transpose of the per-element
// tridiagonal curl_orients transformation, then scatter-add through offsets.
// For the transpose, row n of the result picks up the superdiagonal of row n-1
// (index 3n - 1), the diagonal of row n (3n + 1), and the subdiagonal of row
// n+1 (3n + 3). Each row's block is staged in vv_loc before the atomic
// scatter so SIMD evaluation and accumulation stay separate.
static inline int CeedElemRestrictionApplyCurlOrientedTranspose_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp,
                                                                              const CeedInt block_size, const CeedInt comp_stride, CeedInt start,
                                                                              CeedInt stop, CeedInt num_elem, CeedInt elem_size, CeedSize v_offset,
                                                                              const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
  // Restriction with tridiagonal transformation
  CeedElemRestriction_Memcheck *impl;
  // Scratch for one block of a single row before the atomic scatter (VLA sized by block_size)
  CeedScalar                    vv_loc[block_size];

  CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
  for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) {
    for (CeedInt k = 0; k < num_comp; k++) {
      // Iteration bound set to discard padding elements
      const CeedInt block_end = CeedIntMin(block_size, num_elem - e);
      CeedInt       n         = 0;

      // First row: contributions from rows 0 and 1 only
      CeedPragmaSIMD for (CeedInt j = 0; j < block_end; j++) {
        vv_loc[j] = uu[e * elem_size * (CeedSize)num_comp + (k * (CeedSize)elem_size + n) * block_size + j - v_offset] *
                        impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size] +
                    uu[e * elem_size * (CeedSize)num_comp + (k * (CeedSize)elem_size + n + 1) * block_size + j - v_offset] *
                        impl->curl_orients[j + (3 * n + 3) * block_size + e * 3 * elem_size];
      }
      for (CeedInt j = 0; j < block_end; j++) {
        CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j];
      }
      // Interior rows: contributions from rows n-1, n, and n+1
      for (n = 1; n < elem_size - 1; n++) {
        CeedPragmaSIMD for (CeedInt j = 0; j < block_end; j++) {
          vv_loc[j] = uu[e * elem_size * (CeedSize)num_comp + (k * (CeedSize)elem_size + n - 1) * block_size + j - v_offset] *
                          impl->curl_orients[j + (3 * n - 1) * block_size + e * 3 * elem_size] +
                      uu[e * elem_size * (CeedSize)num_comp + (k * (CeedSize)elem_size + n) * block_size + j - v_offset] *
                          impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size] +
                      uu[e * elem_size * (CeedSize)num_comp + (k * (CeedSize)elem_size + n + 1) * block_size + j - v_offset] *
                          impl->curl_orients[j + (3 * n + 3) * block_size + e * 3 * elem_size];
        }
        for (CeedInt j = 0; j < block_end; j++) {
          CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j];
        }
      }
      // Last row (n == elem_size - 1): contributions from rows n-1 and n only
      CeedPragmaSIMD for (CeedInt j = 0; j < block_end; j++) {
        vv_loc[j] = uu[e * elem_size * (CeedSize)num_comp + (k * (CeedSize)elem_size + n - 1) * block_size + j - v_offset] *
                        impl->curl_orients[j + (3 * n - 1) * block_size + e * 3 * elem_size] +
                    uu[e * elem_size * (CeedSize)num_comp + (k * (CeedSize)elem_size + n) * block_size + j - v_offset] *
                        impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size];
      }
      for (CeedInt j = 0; j < block_end; j++) {
        CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j];
      }
    }
  }
  return CEED_ERROR_SUCCESS;
}
317 
// Unsigned variant of the curl-oriented transpose restriction: identical
// transposed tridiagonal structure, but abs() drops the sign of each
// curl_orients entry so only the sparsity pattern (0/1 magnitudes) is applied.
static inline int CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Memcheck_Core(
    CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride, CeedInt start, CeedInt stop,
    CeedInt num_elem, CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
  // Restriction with (unsigned) tridiagonal transformation
  CeedElemRestriction_Memcheck *impl;
  // Scratch for one block of a single row before the atomic scatter (VLA sized by block_size)
  CeedScalar                    vv_loc[block_size];

  CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
  for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) {
    for (CeedInt k = 0; k < num_comp; k++) {
      // Iteration bound set to discard padding elements
      const CeedInt block_end = CeedIntMin(block_size, num_elem - e);
      CeedInt       n         = 0;

      // First row: contributions from rows 0 and 1 only
      CeedPragmaSIMD for (CeedInt j = 0; j < block_end; j++) {
        vv_loc[j] = uu[e * elem_size * (CeedSize)num_comp + (k * (CeedSize)elem_size + n) * block_size + j - v_offset] *
                        abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]) +
                    uu[e * elem_size * (CeedSize)num_comp + (k * (CeedSize)elem_size + n + 1) * block_size + j - v_offset] *
                        abs(impl->curl_orients[j + (3 * n + 3) * block_size + e * 3 * elem_size]);
      }
      for (CeedInt j = 0; j < block_end; j++) {
        CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j];
      }
      // Interior rows: contributions from rows n-1, n, and n+1
      for (n = 1; n < elem_size - 1; n++) {
        CeedPragmaSIMD for (CeedInt j = 0; j < block_end; j++) {
          vv_loc[j] = uu[e * elem_size * (CeedSize)num_comp + (k * (CeedSize)elem_size + n - 1) * block_size + j - v_offset] *
                          abs(impl->curl_orients[j + (3 * n - 1) * block_size + e * 3 * elem_size]) +
                      uu[e * elem_size * (CeedSize)num_comp + (k * (CeedSize)elem_size + n) * block_size + j - v_offset] *
                          abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]) +
                      uu[e * elem_size * (CeedSize)num_comp + (k * (CeedSize)elem_size + n + 1) * block_size + j - v_offset] *
                          abs(impl->curl_orients[j + (3 * n + 3) * block_size + e * 3 * elem_size]);
        }
        for (CeedInt j = 0; j < block_end; j++) {
          CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j];
        }
      }
      // Last row (n == elem_size - 1): contributions from rows n-1 and n only
      CeedPragmaSIMD for (CeedInt j = 0; j < block_end; j++) {
        vv_loc[j] = uu[e * elem_size * (CeedSize)num_comp + (k * (CeedSize)elem_size + n - 1) * block_size + j - v_offset] *
                        abs(impl->curl_orients[j + (3 * n - 1) * block_size + e * 3 * elem_size]) +
                    uu[e * elem_size * (CeedSize)num_comp + (k * (CeedSize)elem_size + n) * block_size + j - v_offset] *
                        abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]);
      }
      for (CeedInt j = 0; j < block_end; j++) {
        CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j];
      }
    }
  }
  return CEED_ERROR_SUCCESS;
}
367 
368 static inline int CeedElemRestrictionApplyAtPointsInElement_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp, CeedInt start,
369                                                                           CeedInt stop, CeedTransposeMode t_mode, const CeedScalar *__restrict__ uu,
370                                                                           CeedScalar *__restrict__ vv) {
371   CeedInt                       num_points, l_vec_offset;
372   CeedSize                      e_vec_offset = 0;
373   CeedElemRestriction_Memcheck *impl;
374 
375   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
376   for (CeedInt e = start; e < stop; e++) {
377     l_vec_offset = impl->offsets[e];
378     CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr, e, &num_points));
379     if (t_mode == CEED_NOTRANSPOSE) {
380       for (CeedInt i = 0; i < num_points; i++) {
381         for (CeedInt j = 0; j < num_comp; j++) vv[j * (CeedSize)num_points + i + e_vec_offset] = uu[impl->offsets[i + l_vec_offset] * num_comp + j];
382       }
383     } else {
384       for (CeedInt i = 0; i < num_points; i++) {
385         for (CeedInt j = 0; j < num_comp; j++) vv[impl->offsets[i + l_vec_offset] * num_comp + j] = uu[j * (CeedSize)num_points + i + e_vec_offset];
386       }
387     }
388     e_vec_offset += num_points * (CeedSize)num_comp;
389   }
390   return CEED_ERROR_SUCCESS;
391 }
392 
// Shared apply driver: fetches the vector arrays, dispatches on restriction
// type and transpose mode to the specialized kernels above, then restores the
// arrays. use_signs/use_orients select the signed/oriented variants for
// oriented and curl-oriented restrictions; when disabled, the plain offset
// kernel is used instead.
static inline int CeedElemRestrictionApply_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
                                                         const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedTransposeMode t_mode,
                                                         bool use_signs, bool use_orients, CeedVector u, CeedVector v, CeedRequest *request) {
  CeedInt             num_elem, elem_size;
  CeedSize            v_offset;
  CeedRestrictionType rstr_type;
  const CeedScalar   *uu;
  CeedScalar         *vv;

  CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem));
  CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
  // Offset into the E-vector for the first block in [start, stop)
  v_offset = start * block_size * elem_size * (CeedSize)num_comp;
  CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type));
  CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_HOST, &uu));

  if (t_mode == CEED_TRANSPOSE) {
    // Sum into for transpose mode, E-vector to L-vector
    CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_HOST, &vv));
  } else {
    // Overwrite for notranspose mode, L-vector to E-vector
    CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_HOST, &vv));
  }

  if (t_mode == CEED_TRANSPOSE) {
    // Restriction from E-vector to L-vector
    // Performing v += r^T * u
    // uu has shape [elem_size, num_comp, num_elem], row-major
    // vv has shape [nnodes, num_comp]
    // Sum into for transpose mode
    switch (rstr_type) {
      case CEED_RESTRICTION_STRIDED:
        CeedCallBackend(
            CeedElemRestrictionApplyStridedTranspose_Memcheck_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, v_offset, uu, vv));
        break;
      case CEED_RESTRICTION_STANDARD:
        CeedCallBackend(CeedElemRestrictionApplyOffsetTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem,
                                                                              elem_size, v_offset, uu, vv));
        break;
      case CEED_RESTRICTION_ORIENTED:
        // Signed orientation only when requested; otherwise fall back to plain offsets
        if (use_signs) {
          CeedCallBackend(CeedElemRestrictionApplyOrientedTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem,
                                                                                  elem_size, v_offset, uu, vv));
        } else {
          CeedCallBackend(CeedElemRestrictionApplyOffsetTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem,
                                                                                elem_size, v_offset, uu, vv));
        }
        break;
      case CEED_RESTRICTION_CURL_ORIENTED:
        // Full tridiagonal, unsigned tridiagonal, or plain offset fallback
        if (use_signs && use_orients) {
          CeedCallBackend(CeedElemRestrictionApplyCurlOrientedTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem,
                                                                                      elem_size, v_offset, uu, vv));
        } else if (use_orients) {
          CeedCallBackend(CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop,
                                                                                              num_elem, elem_size, v_offset, uu, vv));
        } else {
          CeedCallBackend(CeedElemRestrictionApplyOffsetTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem,
                                                                                elem_size, v_offset, uu, vv));
        }
        break;
      case CEED_RESTRICTION_POINTS:
        CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement_Memcheck_Core(rstr, num_comp, start, stop, t_mode, uu, vv));
        break;
    }
  } else {
    // Restriction from L-vector to E-vector
    // Perform: v = r * u
    // vv has shape [elem_size, num_comp, num_elem], row-major
    // uu has shape [nnodes, num_comp]
    // Overwrite for notranspose mode
    switch (rstr_type) {
      case CEED_RESTRICTION_STRIDED:
        CeedCallBackend(
            CeedElemRestrictionApplyStridedNoTranspose_Memcheck_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, v_offset, uu, vv));
        break;
      case CEED_RESTRICTION_STANDARD:
        CeedCallBackend(CeedElemRestrictionApplyOffsetNoTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem,
                                                                                elem_size, v_offset, uu, vv));
        break;
      case CEED_RESTRICTION_ORIENTED:
        // Signed orientation only when requested; otherwise fall back to plain offsets
        if (use_signs) {
          CeedCallBackend(CeedElemRestrictionApplyOrientedNoTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem,
                                                                                    elem_size, v_offset, uu, vv));
        } else {
          CeedCallBackend(CeedElemRestrictionApplyOffsetNoTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem,
                                                                                  elem_size, v_offset, uu, vv));
        }
        break;
      case CEED_RESTRICTION_CURL_ORIENTED:
        // Full tridiagonal, unsigned tridiagonal, or plain offset fallback
        if (use_signs && use_orients) {
          CeedCallBackend(CeedElemRestrictionApplyCurlOrientedNoTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop,
                                                                                        num_elem, elem_size, v_offset, uu, vv));
        } else if (use_orients) {
          CeedCallBackend(CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop,
                                                                                                num_elem, elem_size, v_offset, uu, vv));
        } else {
          CeedCallBackend(CeedElemRestrictionApplyOffsetNoTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem,
                                                                                  elem_size, v_offset, uu, vv));
        }
        break;
      case CEED_RESTRICTION_POINTS:
        CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement_Memcheck_Core(rstr, num_comp, start, stop, t_mode, uu, vv));
        break;
    }
  }
  CeedCallBackend(CeedVectorRestoreArrayRead(u, &uu));
  CeedCallBackend(CeedVectorRestoreArray(v, &vv));
  if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED) *request = NULL;
  return CEED_ERROR_SUCCESS;
}
502 
503 //------------------------------------------------------------------------------
504 // ElemRestriction Apply
505 //------------------------------------------------------------------------------
506 static int CeedElemRestrictionApply_Memcheck(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) {
507   CeedInt                       num_block, block_size, num_comp, comp_stride;
508   CeedElemRestriction_Memcheck *impl;
509 
510   CeedCallBackend(CeedElemRestrictionGetNumBlocks(rstr, &num_block));
511   CeedCallBackend(CeedElemRestrictionGetBlockSize(rstr, &block_size));
512   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
513   CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride));
514   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
515   CeedCallBackend(impl->Apply(rstr, num_comp, block_size, comp_stride, 0, num_block, t_mode, true, true, u, v, request));
516   return CEED_ERROR_SUCCESS;
517 }
518 
519 //------------------------------------------------------------------------------
520 // ElemRestriction Apply Unsigned
521 //------------------------------------------------------------------------------
522 static int CeedElemRestrictionApplyUnsigned_Memcheck(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v,
523                                                      CeedRequest *request) {
524   CeedInt                       num_block, block_size, num_comp, comp_stride;
525   CeedElemRestriction_Memcheck *impl;
526 
527   CeedCallBackend(CeedElemRestrictionGetNumBlocks(rstr, &num_block));
528   CeedCallBackend(CeedElemRestrictionGetBlockSize(rstr, &block_size));
529   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
530   CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride));
531   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
532   CeedCallBackend(impl->Apply(rstr, num_comp, block_size, comp_stride, 0, num_block, t_mode, false, true, u, v, request));
533   return CEED_ERROR_SUCCESS;
534 }
535 
536 //------------------------------------------------------------------------------
537 // ElemRestriction Apply Unoriented
538 //------------------------------------------------------------------------------
539 static int CeedElemRestrictionApplyUnoriented_Memcheck(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v,
540                                                        CeedRequest *request) {
541   CeedInt                       num_block, block_size, num_comp, comp_stride;
542   CeedElemRestriction_Memcheck *impl;
543 
544   CeedCallBackend(CeedElemRestrictionGetNumBlocks(rstr, &num_block));
545   CeedCallBackend(CeedElemRestrictionGetBlockSize(rstr, &block_size));
546   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
547   CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride));
548   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
549   CeedCallBackend(impl->Apply(rstr, num_comp, block_size, comp_stride, 0, num_block, t_mode, false, false, u, v, request));
550   return CEED_ERROR_SUCCESS;
551 }
552 
553 //------------------------------------------------------------------------------
554 // ElemRestriction Apply Points
555 //------------------------------------------------------------------------------
556 static int CeedElemRestrictionApplyAtPointsInElement_Memcheck(CeedElemRestriction rstr, CeedInt elem, CeedTransposeMode t_mode, CeedVector u,
557                                                               CeedVector v, CeedRequest *request) {
558   CeedInt                       num_comp;
559   CeedElemRestriction_Memcheck *impl;
560 
561   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
562   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
563   return impl->Apply(rstr, num_comp, 0, 1, elem, elem + 1, t_mode, false, false, u, v, request);
564 }
565 
566 //------------------------------------------------------------------------------
567 // ElemRestriction Apply Block
568 //------------------------------------------------------------------------------
569 static int CeedElemRestrictionApplyBlock_Memcheck(CeedElemRestriction rstr, CeedInt block, CeedTransposeMode t_mode, CeedVector u, CeedVector v,
570                                                   CeedRequest *request) {
571   CeedInt                       block_size, num_comp, comp_stride;
572   CeedElemRestriction_Memcheck *impl;
573 
574   CeedCallBackend(CeedElemRestrictionGetBlockSize(rstr, &block_size));
575   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
576   CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride));
577   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
578   CeedCallBackend(impl->Apply(rstr, num_comp, block_size, comp_stride, block, block + 1, t_mode, true, true, u, v, request));
579   return CEED_ERROR_SUCCESS;
580 }
581 
582 //------------------------------------------------------------------------------
583 // ElemRestriction Get Offsets
584 //------------------------------------------------------------------------------
585 static int CeedElemRestrictionGetOffsets_Memcheck(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt **offsets) {
586   CeedElemRestriction_Memcheck *impl;
587 
588   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
589 
590   CeedCheck(mem_type == CEED_MEM_HOST, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_BACKEND, "Can only provide to HOST memory");
591 
592   *offsets = impl->offsets;
593   return CEED_ERROR_SUCCESS;
594 }
595 
596 //------------------------------------------------------------------------------
597 // ElemRestriction Get Orientations
598 //------------------------------------------------------------------------------
599 static int CeedElemRestrictionGetOrientations_Memcheck(CeedElemRestriction rstr, CeedMemType mem_type, const bool **orients) {
600   CeedElemRestriction_Memcheck *impl;
601 
602   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
603 
604   CeedCheck(mem_type == CEED_MEM_HOST, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_BACKEND, "Can only provide to HOST memory");
605 
606   *orients = impl->orients;
607   return CEED_ERROR_SUCCESS;
608 }
609 
610 //------------------------------------------------------------------------------
611 // ElemRestriction Get Curl-Conforming Orientations
612 //------------------------------------------------------------------------------
613 static int CeedElemRestrictionGetCurlOrientations_Memcheck(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt8 **curl_orients) {
614   CeedElemRestriction_Memcheck *impl;
615 
616   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
617 
618   CeedCheck(mem_type == CEED_MEM_HOST, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_BACKEND, "Can only provide to HOST memory");
619 
620   *curl_orients = impl->curl_orients;
621   return CEED_ERROR_SUCCESS;
622 }
623 
624 //------------------------------------------------------------------------------
625 // ElemRestriction Destroy
626 //------------------------------------------------------------------------------
627 static int CeedElemRestrictionDestroy_Memcheck(CeedElemRestriction rstr) {
628   CeedElemRestriction_Memcheck *impl;
629 
630   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
631   CeedCallBackend(CeedFree(&impl->offsets_allocated));
632   CeedCallBackend(CeedFree(&impl->orients_allocated));
633   CeedCallBackend(CeedFree(&impl->curl_orients_allocated));
634   CeedCallBackend(CeedFree(&impl));
635   return CEED_ERROR_SUCCESS;
636 }
637 
638 //------------------------------------------------------------------------------
639 // ElemRestriction Create
640 //------------------------------------------------------------------------------
641 int CeedElemRestrictionCreate_Memcheck(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *offsets, const bool *orients,
642                                        const CeedInt8 *curl_orients, CeedElemRestriction rstr) {
643   Ceed                          ceed;
644   CeedInt                       num_elem, elem_size, num_block, block_size, num_comp, comp_stride, num_points = 0, num_offsets;
645   CeedRestrictionType           rstr_type;
646   CeedElemRestriction_Memcheck *impl;
647 
648   CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed));
649   CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem));
650   CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
651   CeedCallBackend(CeedElemRestrictionGetNumBlocks(rstr, &num_block));
652   CeedCallBackend(CeedElemRestrictionGetBlockSize(rstr, &block_size));
653   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
654   CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride));
655   CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type));
656 
657   CeedCheck(mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "Only MemType = HOST supported");
658 
659   CeedCallBackend(CeedCalloc(1, &impl));
660   CeedCallBackend(CeedElemRestrictionSetData(rstr, impl));
661 
662   // Set layouts
663   {
664     bool    has_backend_strides;
665     CeedInt e_layout[3] = {1, elem_size, elem_size * num_comp}, l_layout[3] = {0};
666 
667     CeedCallBackend(CeedElemRestrictionSetELayout(rstr, e_layout));
668     if (rstr_type == CEED_RESTRICTION_STRIDED) {
669       CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides));
670       if (has_backend_strides) {
671         CeedCallBackend(CeedElemRestrictionGetBackendStrides_Memcheck(rstr, l_layout));
672         CeedCallBackend(CeedElemRestrictionSetLLayout(rstr, l_layout));
673       }
674     }
675   }
676 
677   // Offsets data
678   if (rstr_type != CEED_RESTRICTION_STRIDED) {
679     const char *resource;
680 
681     // Check indices for ref or memcheck backends
682     {
683       Ceed current = ceed, parent = NULL;
684 
685       CeedCallBackend(CeedGetParent(current, &parent));
686       while (current != parent) {
687         current = parent;
688         CeedCallBackend(CeedGetParent(current, &parent));
689       }
690       CeedCallBackend(CeedGetResource(parent, &resource));
691     }
692     if (!strcmp(resource, "/cpu/self/ref/serial") || !strcmp(resource, "/cpu/self/ref/blocked") || !strcmp(resource, "/cpu/self/memcheck/serial") ||
693         !strcmp(resource, "/cpu/self/memcheck/blocked")) {
694       CeedSize l_size;
695 
696       CeedCallBackend(CeedElemRestrictionGetLVectorSize(rstr, &l_size));
697       for (CeedInt i = 0; i < num_elem * elem_size; i++) {
698         CeedCheck(offsets[i] >= 0 && offsets[i] + (num_comp - 1) * comp_stride < l_size, ceed, CEED_ERROR_BACKEND,
699                   "Restriction offset %" CeedInt_FMT " (%" CeedInt_FMT ") out of range [0, %" CeedInt_FMT "]", i, offsets[i], l_size);
700       }
701     }
702 
703     // Copy data
704     if (rstr_type == CEED_RESTRICTION_POINTS) CeedCallBackend(CeedElemRestrictionGetNumPoints(rstr, &num_points));
705     num_offsets = rstr_type == CEED_RESTRICTION_POINTS ? (num_elem + 1 + num_points) : (num_elem * elem_size);
706     switch (copy_mode) {
707       case CEED_COPY_VALUES:
708         CeedCallBackend(CeedMalloc(num_offsets, &impl->offsets_allocated));
709         memcpy(impl->offsets_allocated, offsets, num_offsets * sizeof(offsets[0]));
710         impl->offsets = impl->offsets_allocated;
711         break;
712       case CEED_OWN_POINTER:
713         impl->offsets_allocated = (CeedInt *)offsets;
714         impl->offsets           = impl->offsets_allocated;
715         break;
716       case CEED_USE_POINTER:
717         impl->offsets = offsets;
718     }
719 
720     // Orientation data
721     if (rstr_type == CEED_RESTRICTION_ORIENTED) {
722       CeedCheck(orients != NULL, ceed, CEED_ERROR_BACKEND, "No orients array provided for oriented restriction");
723       switch (copy_mode) {
724         case CEED_COPY_VALUES:
725           CeedCallBackend(CeedMalloc(num_offsets, &impl->orients_allocated));
726           memcpy(impl->orients_allocated, orients, num_offsets * sizeof(orients[0]));
727           impl->orients = impl->orients_allocated;
728           break;
729         case CEED_OWN_POINTER:
730           impl->orients_allocated = (bool *)orients;
731           impl->orients           = impl->orients_allocated;
732           break;
733         case CEED_USE_POINTER:
734           impl->orients = orients;
735       }
736     } else if (rstr_type == CEED_RESTRICTION_CURL_ORIENTED) {
737       CeedCheck(curl_orients != NULL, ceed, CEED_ERROR_BACKEND, "No curl_orients array provided for oriented restriction");
738       switch (copy_mode) {
739         case CEED_COPY_VALUES:
740           CeedCallBackend(CeedMalloc(3 * num_offsets, &impl->curl_orients_allocated));
741           memcpy(impl->curl_orients_allocated, curl_orients, 3 * num_offsets * sizeof(curl_orients[0]));
742           impl->curl_orients = impl->curl_orients_allocated;
743           break;
744         case CEED_OWN_POINTER:
745           impl->curl_orients_allocated = (CeedInt8 *)curl_orients;
746           impl->curl_orients           = impl->curl_orients_allocated;
747           break;
748         case CEED_USE_POINTER:
749           impl->curl_orients = curl_orients;
750       }
751     }
752   }
753 
754   // Set apply function
755   impl->Apply = CeedElemRestrictionApply_Memcheck_Core;
756 
757   // Register backend functions
758   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Apply", CeedElemRestrictionApply_Memcheck));
759   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnsigned", CeedElemRestrictionApplyUnsigned_Memcheck));
760   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnoriented", CeedElemRestrictionApplyUnoriented_Memcheck));
761   if (rstr_type == CEED_RESTRICTION_POINTS) {
762     CeedCallBackend(
763         CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyAtPointsInElement", CeedElemRestrictionApplyAtPointsInElement_Memcheck));
764   }
765   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyBlock", CeedElemRestrictionApplyBlock_Memcheck));
766   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOffsets", CeedElemRestrictionGetOffsets_Memcheck));
767   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOrientations", CeedElemRestrictionGetOrientations_Memcheck));
768   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetCurlOrientations", CeedElemRestrictionGetCurlOrientations_Memcheck));
769   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Memcheck));
770   return CEED_ERROR_SUCCESS;
771 }
772 
773 //------------------------------------------------------------------------------
774