xref: /libCEED/backends/memcheck/ceed-memcheck-restriction.c (revision 0c9ac183098ed55d0b8f535d4ee42b5752ebbe4d)
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <ceed/backend.h>
10 #include <stdbool.h>
11 #include <stdlib.h>
12 #include <string.h>
13 
14 #include "ceed-memcheck.h"
15 
16 //------------------------------------------------------------------------------
17 // Set backend strides
18 //------------------------------------------------------------------------------
19 static inline int CeedElemRestrictionGetBackendStrides_Memcheck(CeedElemRestriction rstr, CeedInt strides[3]) {
20   CeedInt elem_size, num_comp, num_elem;
21 
22   CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
23   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
24   CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem));
25   // Memcheck default, contiguous by component, then node
26   strides[0] = num_comp;
27   strides[1] = 1;
28   strides[2] = num_comp * elem_size;
29   /**
30       // CPU default, contiguous by node, then component
31       strides[0] = 1;
32       strides[1] = elem_size;
33       strides[2] = elem_size * num_comp;
34 
35       // GPU default, contiguous by node, then element
36       strides[0] = 1;
37       strides[1] = num_elem * elem_size;
38       strides[2] = elem_size;
39      **/
40   return CEED_ERROR_SUCCESS;
41 }
42 
43 //------------------------------------------------------------------------------
44 // Core ElemRestriction Apply Code
45 //------------------------------------------------------------------------------
46 static inline int CeedElemRestrictionApplyStridedNoTranspose_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
47                                                                            CeedInt start, CeedInt stop, CeedInt num_elem, CeedInt elem_size,
48                                                                            CeedInt v_offset, const CeedScalar *__restrict__ uu,
49                                                                            CeedScalar *__restrict__ vv) {
50   // Get strides
51   bool    has_backend_strides;
52   CeedInt strides[3] = {0};
53 
54   CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides));
55   if (has_backend_strides) CeedCallBackend(CeedElemRestrictionGetBackendStrides_Memcheck(rstr, strides));
56   else CeedCallBackend(CeedElemRestrictionGetStrides(rstr, strides));
57 
58   // Apply restriction
59   for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) {
60     CeedPragmaSIMD for (CeedInt k = 0; k < num_comp; k++) {
61       CeedPragmaSIMD for (CeedInt n = 0; n < elem_size; n++) {
62         CeedPragmaSIMD for (CeedInt j = 0; j < block_size; j++) {
63           vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
64               uu[n * strides[0] + k * strides[1] + CeedIntMin(e + j, num_elem - 1) * strides[2]];
65         }
66       }
67     }
68   }
69   return CEED_ERROR_SUCCESS;
70 }
71 
72 static inline int CeedElemRestrictionApplyOffsetNoTranspose_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
73                                                                           const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem,
74                                                                           CeedInt elem_size, CeedInt v_offset, const CeedScalar *__restrict__ uu,
75                                                                           CeedScalar *__restrict__ vv) {
76   // Default restriction with offsets
77   CeedElemRestriction_Memcheck *impl;
78 
79   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
80   for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) {
81     CeedPragmaSIMD for (CeedInt k = 0; k < num_comp; k++) {
82       CeedPragmaSIMD for (CeedInt i = 0; i < elem_size * block_size; i++) {
83         vv[elem_size * (k * block_size + e * num_comp) + i - v_offset] = uu[impl->offsets[i + e * elem_size] + k * comp_stride];
84       }
85     }
86   }
87   return CEED_ERROR_SUCCESS;
88 }
89 
90 static inline int CeedElemRestrictionApplyOrientedNoTranspose_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp,
91                                                                             const CeedInt block_size, const CeedInt comp_stride, CeedInt start,
92                                                                             CeedInt stop, CeedInt num_elem, CeedInt elem_size, CeedInt v_offset,
93                                                                             const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
94   // Restriction with orientations
95   CeedElemRestriction_Memcheck *impl;
96 
97   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
98   for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) {
99     CeedPragmaSIMD for (CeedInt k = 0; k < num_comp; k++) {
100       CeedPragmaSIMD for (CeedInt i = 0; i < elem_size * block_size; i++) {
101         vv[elem_size * (k * block_size + e * num_comp) + i - v_offset] =
102             uu[impl->offsets[i + e * elem_size] + k * comp_stride] * (impl->orients[i + e * elem_size] ? -1.0 : 1.0);
103       }
104     }
105   }
106   return CEED_ERROR_SUCCESS;
107 }
108 
// Curl-oriented restriction, L-vector to E-vector.
// Each element stores three curl_orients entries per node, forming the rows of a
// tridiagonal transformation: E-vector node n combines the L-vector dofs at nodes
// n-1, n, and n+1, weighted by curl_orients entries (3*n+0), (3*n+1), (3*n+2).
static inline int CeedElemRestrictionApplyCurlOrientedNoTranspose_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp,
                                                                                const CeedInt block_size, const CeedInt comp_stride, CeedInt start,
                                                                                CeedInt stop, CeedInt num_elem, CeedInt elem_size, CeedInt v_offset,
                                                                                const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
  // Restriction with tridiagonal transformation
  CeedElemRestriction_Memcheck *impl;

  CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
  for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) {
    CeedPragmaSIMD for (CeedInt k = 0; k < num_comp; k++) {
      CeedInt n = 0;

      // First node row: no sub-diagonal, only diagonal and super-diagonal terms
      CeedPragmaSIMD for (CeedInt j = 0; j < block_size; j++) {
        vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
            uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] *
                impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size] +
            uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] *
                impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size];
      }
      // Interior node rows: full three-term (sub, diagonal, super) combination
      CeedPragmaSIMD for (n = 1; n < elem_size - 1; n++) {
        CeedPragmaSIMD for (CeedInt j = 0; j < block_size; j++) {
          vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
              uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] *
                  impl->curl_orients[j + (3 * n + 0) * block_size + e * 3 * elem_size] +
              uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] *
                  impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size] +
              uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] *
                  impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size];
        }
      }
      // Last node row (n == elem_size - 1 after the loop): no super-diagonal term
      CeedPragmaSIMD for (CeedInt j = 0; j < block_size; j++) {
        vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
            uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] *
                impl->curl_orients[j + (3 * n + 0) * block_size + e * 3 * elem_size] +
            uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] *
                impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size];
      }
    }
  }
  return CEED_ERROR_SUCCESS;
}
150 
// Unsigned curl-oriented restriction, L-vector to E-vector.
// Identical tridiagonal structure to the signed variant, but abs() strips the
// sign from every curl_orients weight so only the sparsity pattern is applied.
static inline int CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Memcheck_Core(
    CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride, CeedInt start, CeedInt stop,
    CeedInt num_elem, CeedInt elem_size, CeedInt v_offset, const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
  // Restriction with (unsigned) tridiagonal transformation
  CeedElemRestriction_Memcheck *impl;

  CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
  for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) {
    CeedPragmaSIMD for (CeedInt k = 0; k < num_comp; k++) {
      CeedInt n = 0;

      // First node row: no sub-diagonal term
      CeedPragmaSIMD for (CeedInt j = 0; j < block_size; j++) {
        vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
            uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] *
                abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]) +
            uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] *
                abs(impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size]);
      }
      // Interior node rows: full three-term combination
      CeedPragmaSIMD for (n = 1; n < elem_size - 1; n++) {
        CeedPragmaSIMD for (CeedInt j = 0; j < block_size; j++) {
          vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
              uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] *
                  abs(impl->curl_orients[j + (3 * n + 0) * block_size + e * 3 * elem_size]) +
              uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] *
                  abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]) +
              uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] *
                  abs(impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size]);
        }
      }
      // Last node row: no super-diagonal term
      CeedPragmaSIMD for (CeedInt j = 0; j < block_size; j++) {
        vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
            uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] *
                abs(impl->curl_orients[j + (3 * n + 0) * block_size + e * 3 * elem_size]) +
            uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] *
                abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]);
      }
    }
  }
  return CEED_ERROR_SUCCESS;
}
191 
192 static inline int CeedElemRestrictionApplyStridedTranspose_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
193                                                                          CeedInt start, CeedInt stop, CeedInt num_elem, CeedInt elem_size,
194                                                                          CeedInt v_offset, const CeedScalar *__restrict__ uu,
195                                                                          CeedScalar *__restrict__ vv) {
196   // Get strides
197   bool    has_backend_strides;
198   CeedInt strides[3] = {0};
199 
200   CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides));
201   if (has_backend_strides) CeedCallBackend(CeedElemRestrictionGetBackendStrides_Memcheck(rstr, strides));
202   else CeedCallBackend(CeedElemRestrictionGetStrides(rstr, strides));
203 
204   // Apply restriction
205   for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) {
206     CeedPragmaSIMD for (CeedInt k = 0; k < num_comp; k++) {
207       CeedPragmaSIMD for (CeedInt n = 0; n < elem_size; n++) {
208         CeedPragmaSIMD for (CeedInt j = 0; j < CeedIntMin(block_size, num_elem - e); j++) {
209           vv[n * strides[0] + k * strides[1] + (e + j) * strides[2]] +=
210               uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset];
211         }
212       }
213     }
214   }
215   return CEED_ERROR_SUCCESS;
216 }
217 
218 static inline int CeedElemRestrictionApplyOffsetTranspose_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
219                                                                         const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem,
220                                                                         CeedInt elem_size, CeedInt v_offset, const CeedScalar *__restrict__ uu,
221                                                                         CeedScalar *__restrict__ vv) {
222   // Default restriction with offsets
223   CeedElemRestriction_Memcheck *impl;
224 
225   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
226   for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) {
227     for (CeedInt k = 0; k < num_comp; k++) {
228       for (CeedInt i = 0; i < elem_size * block_size; i += block_size) {
229         // Iteration bound set to discard padding elements
230         for (CeedInt j = i; j < i + CeedIntMin(block_size, num_elem - e); j++) {
231           CeedScalar vv_loc;
232 
233           vv_loc = uu[elem_size * (k * block_size + e * num_comp) + j - v_offset];
234           CeedPragmaAtomic vv[impl->offsets[j + e * elem_size] + k * comp_stride] += vv_loc;
235         }
236       }
237     }
238   }
239   return CEED_ERROR_SUCCESS;
240 }
241 
242 static inline int CeedElemRestrictionApplyOrientedTranspose_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
243                                                                           const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem,
244                                                                           CeedInt elem_size, CeedInt v_offset, const CeedScalar *__restrict__ uu,
245                                                                           CeedScalar *__restrict__ vv) {
246   // Restriction with orientations
247   CeedElemRestriction_Memcheck *impl;
248 
249   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
250   for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) {
251     for (CeedInt k = 0; k < num_comp; k++) {
252       for (CeedInt i = 0; i < elem_size * block_size; i += block_size) {
253         // Iteration bound set to discard padding elements
254         for (CeedInt j = i; j < i + CeedIntMin(block_size, num_elem - e); j++) {
255           CeedScalar vv_loc;
256 
257           vv_loc = uu[elem_size * (k * block_size + e * num_comp) + j - v_offset] * (impl->orients[j + e * elem_size] ? -1.0 : 1.0);
258           CeedPragmaAtomic vv[impl->offsets[j + e * elem_size] + k * comp_stride] += vv_loc;
259         }
260       }
261     }
262   }
263   return CEED_ERROR_SUCCESS;
264 }
265 
// Curl-oriented restriction transpose, E-vector summed into L-vector.
// Applies the transpose of the per-element tridiagonal transformation: node n
// accumulates E-vector values from nodes n-1, n, n+1 weighted by the entries
// those rows store for node n — (3*n - 1) == 3*(n-1) + 2 is the super-diagonal
// of node n-1 and (3*n + 3) == 3*(n+1) + 0 is the sub-diagonal of node n+1.
static inline int CeedElemRestrictionApplyCurlOrientedTranspose_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp,
                                                                              const CeedInt block_size, const CeedInt comp_stride, CeedInt start,
                                                                              CeedInt stop, CeedInt num_elem, CeedInt elem_size, CeedInt v_offset,
                                                                              const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
  // Restriction with tridiagonal transformation
  CeedElemRestriction_Memcheck *impl;
  // Per-lane scratch for one node row; VLA sized by block_size
  // NOTE(review): assumes block_size is small and bounded — confirm against callers
  CeedScalar                    vv_loc[block_size];

  CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
  for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) {
    for (CeedInt k = 0; k < num_comp; k++) {
      // Iteration bound set to discard padding elements
      const CeedInt block_end = CeedIntMin(block_size, num_elem - e);
      CeedInt       n         = 0;

      // First node row: contributions from nodes 0 and 1 only
      CeedPragmaSIMD for (CeedInt j = 0; j < block_end; j++) {
        vv_loc[j] = uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] *
                        impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size] +
                    uu[e * elem_size * num_comp + (k * elem_size + n + 1) * block_size + j - v_offset] *
                        impl->curl_orients[j + (3 * n + 3) * block_size + e * 3 * elem_size];
      }
      // Scatter the staged values; atomic since L-vector dofs may be shared
      for (CeedInt j = 0; j < block_end; j++) {
        CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j];
      }
      // Interior node rows: three-term combination from nodes n-1, n, n+1
      for (n = 1; n < elem_size - 1; n++) {
        CeedPragmaSIMD for (CeedInt j = 0; j < block_end; j++) {
          vv_loc[j] = uu[e * elem_size * num_comp + (k * elem_size + n - 1) * block_size + j - v_offset] *
                          impl->curl_orients[j + (3 * n - 1) * block_size + e * 3 * elem_size] +
                      uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] *
                          impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size] +
                      uu[e * elem_size * num_comp + (k * elem_size + n + 1) * block_size + j - v_offset] *
                          impl->curl_orients[j + (3 * n + 3) * block_size + e * 3 * elem_size];
        }
        for (CeedInt j = 0; j < block_end; j++) {
          CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j];
        }
      }
      // Last node row: contributions from nodes n-1 and n only
      CeedPragmaSIMD for (CeedInt j = 0; j < block_end; j++) {
        vv_loc[j] = uu[e * elem_size * num_comp + (k * elem_size + n - 1) * block_size + j - v_offset] *
                        impl->curl_orients[j + (3 * n - 1) * block_size + e * 3 * elem_size] +
                    uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] *
                        impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size];
      }
      for (CeedInt j = 0; j < block_end; j++) {
        CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j];
      }
    }
  }
  return CEED_ERROR_SUCCESS;
}
316 
// Unsigned curl-oriented restriction transpose, E-vector summed into L-vector.
// Same transposed tridiagonal structure as the signed variant, but abs() strips
// the sign from every curl_orients weight so only the sparsity pattern is applied.
static inline int CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Memcheck_Core(
    CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride, CeedInt start, CeedInt stop,
    CeedInt num_elem, CeedInt elem_size, CeedInt v_offset, const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
  // Restriction with (unsigned) tridiagonal transformation
  CeedElemRestriction_Memcheck *impl;
  // Per-lane scratch for one node row; VLA sized by block_size
  // NOTE(review): assumes block_size is small and bounded — confirm against callers
  CeedScalar                    vv_loc[block_size];

  CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
  for (CeedInt e = start * block_size; e < stop * block_size; e += block_size) {
    for (CeedInt k = 0; k < num_comp; k++) {
      // Iteration bound set to discard padding elements
      const CeedInt block_end = CeedIntMin(block_size, num_elem - e);
      CeedInt       n         = 0;

      // First node row: contributions from nodes 0 and 1 only
      CeedPragmaSIMD for (CeedInt j = 0; j < block_end; j++) {
        vv_loc[j] = uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] *
                        abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]) +
                    uu[e * elem_size * num_comp + (k * elem_size + n + 1) * block_size + j - v_offset] *
                        abs(impl->curl_orients[j + (3 * n + 3) * block_size + e * 3 * elem_size]);
      }
      // Scatter the staged values; atomic since L-vector dofs may be shared
      for (CeedInt j = 0; j < block_end; j++) {
        CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j];
      }
      // Interior node rows: three-term combination from nodes n-1, n, n+1
      for (n = 1; n < elem_size - 1; n++) {
        CeedPragmaSIMD for (CeedInt j = 0; j < block_end; j++) {
          vv_loc[j] = uu[e * elem_size * num_comp + (k * elem_size + n - 1) * block_size + j - v_offset] *
                          abs(impl->curl_orients[j + (3 * n - 1) * block_size + e * 3 * elem_size]) +
                      uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] *
                          abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]) +
                      uu[e * elem_size * num_comp + (k * elem_size + n + 1) * block_size + j - v_offset] *
                          abs(impl->curl_orients[j + (3 * n + 3) * block_size + e * 3 * elem_size]);
        }
        for (CeedInt j = 0; j < block_end; j++) {
          CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j];
        }
      }
      // Last node row: contributions from nodes n-1 and n only
      CeedPragmaSIMD for (CeedInt j = 0; j < block_end; j++) {
        vv_loc[j] = uu[e * elem_size * num_comp + (k * elem_size + n - 1) * block_size + j - v_offset] *
                        abs(impl->curl_orients[j + (3 * n - 1) * block_size + e * 3 * elem_size]) +
                    uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] *
                        abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]);
      }
      for (CeedInt j = 0; j < block_end; j++) {
        CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j];
      }
    }
  }
  return CEED_ERROR_SUCCESS;
}
366 
367 static inline int CeedElemRestrictionApplyAtPointsInElement_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp, CeedInt start,
368                                                                           CeedInt stop, CeedTransposeMode t_mode, const CeedScalar *__restrict__ uu,
369                                                                           CeedScalar *__restrict__ vv) {
370   CeedInt                       num_points, l_vec_offset, e_vec_offset = 0;
371   CeedElemRestriction_Memcheck *impl;
372 
373   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
374   for (CeedInt e = start; e < stop; e++) {
375     l_vec_offset = impl->offsets[e];
376     CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr, e, &num_points));
377     if (t_mode == CEED_NOTRANSPOSE) {
378       for (CeedInt i = 0; i < num_points; i++) {
379         for (CeedInt j = 0; j < num_comp; j++) vv[j * num_points + i + e_vec_offset] = uu[impl->offsets[i + l_vec_offset] * num_comp + j];
380       }
381     } else {
382       for (CeedInt i = 0; i < num_points; i++) {
383         for (CeedInt j = 0; j < num_comp; j++) vv[impl->offsets[i + l_vec_offset] * num_comp + j] = uu[j * num_points + i + e_vec_offset];
384       }
385     }
386     e_vec_offset += num_points * num_comp;
387   }
388   return CEED_ERROR_SUCCESS;
389 }
390 
// Core apply routine: fetch the vector arrays, then dispatch on transpose mode
// and restriction type to the specialized kernels above. use_signs/use_orients
// select whether orientation signs and curl orientations are applied; when
// disabled the plain offset kernels are used instead.
static inline int CeedElemRestrictionApply_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
                                                         const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedTransposeMode t_mode,
                                                         bool use_signs, bool use_orients, CeedVector u, CeedVector v, CeedRequest *request) {
  CeedInt             num_elem, elem_size, v_offset;
  CeedRestrictionType rstr_type;
  const CeedScalar   *uu;
  CeedScalar         *vv;

  CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem));
  CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
  // Offset into the E-vector when applying only a sub-range of blocks
  v_offset = start * block_size * elem_size * num_comp;
  CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type));
  CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_HOST, &uu));

  if (t_mode == CEED_TRANSPOSE) {
    // Sum into for transpose mode, E-vector to L-vector
    CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_HOST, &vv));
  } else {
    // Overwrite for notranspose mode, L-vector to E-vector
    CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_HOST, &vv));
  }

  if (t_mode == CEED_TRANSPOSE) {
    // Restriction from E-vector to L-vector
    // Performing v += r^T * u
    // uu has shape [elem_size, num_comp, num_elem], row-major
    // vv has shape [nnodes, num_comp]
    // Sum into for transpose mode
    switch (rstr_type) {
      case CEED_RESTRICTION_STRIDED:
        CeedCallBackend(
            CeedElemRestrictionApplyStridedTranspose_Memcheck_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, v_offset, uu, vv));
        break;
      case CEED_RESTRICTION_STANDARD:
        CeedCallBackend(CeedElemRestrictionApplyOffsetTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem,
                                                                              elem_size, v_offset, uu, vv));
        break;
      case CEED_RESTRICTION_ORIENTED:
        // Fall back to the plain offset kernel when signs are disabled
        if (use_signs) {
          CeedCallBackend(CeedElemRestrictionApplyOrientedTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem,
                                                                                  elem_size, v_offset, uu, vv));
        } else {
          CeedCallBackend(CeedElemRestrictionApplyOffsetTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem,
                                                                                elem_size, v_offset, uu, vv));
        }
        break;
      case CEED_RESTRICTION_CURL_ORIENTED:
        // Signed + oriented, unsigned (orients only), or plain offsets
        if (use_signs && use_orients) {
          CeedCallBackend(CeedElemRestrictionApplyCurlOrientedTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem,
                                                                                      elem_size, v_offset, uu, vv));
        } else if (use_orients) {
          CeedCallBackend(CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop,
                                                                                              num_elem, elem_size, v_offset, uu, vv));
        } else {
          CeedCallBackend(CeedElemRestrictionApplyOffsetTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem,
                                                                                elem_size, v_offset, uu, vv));
        }
        break;
      case CEED_RESTRICTION_POINTS:
        CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement_Memcheck_Core(rstr, num_comp, start, stop, t_mode, uu, vv));
        break;
    }
  } else {
    // Restriction from L-vector to E-vector
    // Perform: v = r * u
    // vv has shape [elem_size, num_comp, num_elem], row-major
    // uu has shape [nnodes, num_comp]
    // Overwrite for notranspose mode
    switch (rstr_type) {
      case CEED_RESTRICTION_STRIDED:
        CeedCallBackend(
            CeedElemRestrictionApplyStridedNoTranspose_Memcheck_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, v_offset, uu, vv));
        break;
      case CEED_RESTRICTION_STANDARD:
        CeedCallBackend(CeedElemRestrictionApplyOffsetNoTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem,
                                                                                elem_size, v_offset, uu, vv));
        break;
      case CEED_RESTRICTION_ORIENTED:
        // Fall back to the plain offset kernel when signs are disabled
        if (use_signs) {
          CeedCallBackend(CeedElemRestrictionApplyOrientedNoTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem,
                                                                                    elem_size, v_offset, uu, vv));
        } else {
          CeedCallBackend(CeedElemRestrictionApplyOffsetNoTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem,
                                                                                  elem_size, v_offset, uu, vv));
        }
        break;
      case CEED_RESTRICTION_CURL_ORIENTED:
        // Signed + oriented, unsigned (orients only), or plain offsets
        if (use_signs && use_orients) {
          CeedCallBackend(CeedElemRestrictionApplyCurlOrientedNoTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop,
                                                                                        num_elem, elem_size, v_offset, uu, vv));
        } else if (use_orients) {
          CeedCallBackend(CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop,
                                                                                                num_elem, elem_size, v_offset, uu, vv));
        } else {
          CeedCallBackend(CeedElemRestrictionApplyOffsetNoTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem,
                                                                                  elem_size, v_offset, uu, vv));
        }
        break;
      case CEED_RESTRICTION_POINTS:
        CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement_Memcheck_Core(rstr, num_comp, start, stop, t_mode, uu, vv));
        break;
    }
  }
  CeedCallBackend(CeedVectorRestoreArrayRead(u, &uu));
  CeedCallBackend(CeedVectorRestoreArray(v, &vv));
  // Mark the request complete; this backend applies synchronously
  if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED) *request = NULL;
  return CEED_ERROR_SUCCESS;
}
499 
500 //------------------------------------------------------------------------------
501 // ElemRestriction Apply
502 //------------------------------------------------------------------------------
503 static int CeedElemRestrictionApply_Memcheck(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) {
504   CeedInt                       num_block, block_size, num_comp, comp_stride;
505   CeedElemRestriction_Memcheck *impl;
506 
507   CeedCallBackend(CeedElemRestrictionGetNumBlocks(rstr, &num_block));
508   CeedCallBackend(CeedElemRestrictionGetBlockSize(rstr, &block_size));
509   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
510   CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride));
511   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
512   CeedCallBackend(impl->Apply(rstr, num_comp, block_size, comp_stride, 0, num_block, t_mode, true, true, u, v, request));
513   return CEED_ERROR_SUCCESS;
514 }
515 
516 //------------------------------------------------------------------------------
517 // ElemRestriction Apply Unsigned
518 //------------------------------------------------------------------------------
519 static int CeedElemRestrictionApplyUnsigned_Memcheck(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v,
520                                                      CeedRequest *request) {
521   CeedInt                       num_block, block_size, num_comp, comp_stride;
522   CeedElemRestriction_Memcheck *impl;
523 
524   CeedCallBackend(CeedElemRestrictionGetNumBlocks(rstr, &num_block));
525   CeedCallBackend(CeedElemRestrictionGetBlockSize(rstr, &block_size));
526   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
527   CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride));
528   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
529   CeedCallBackend(impl->Apply(rstr, num_comp, block_size, comp_stride, 0, num_block, t_mode, false, true, u, v, request));
530   return CEED_ERROR_SUCCESS;
531 }
532 
533 //------------------------------------------------------------------------------
534 // ElemRestriction Apply Unoriented
535 //------------------------------------------------------------------------------
536 static int CeedElemRestrictionApplyUnoriented_Memcheck(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v,
537                                                        CeedRequest *request) {
538   CeedInt                       num_block, block_size, num_comp, comp_stride;
539   CeedElemRestriction_Memcheck *impl;
540 
541   CeedCallBackend(CeedElemRestrictionGetNumBlocks(rstr, &num_block));
542   CeedCallBackend(CeedElemRestrictionGetBlockSize(rstr, &block_size));
543   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
544   CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride));
545   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
546   CeedCallBackend(impl->Apply(rstr, num_comp, block_size, comp_stride, 0, num_block, t_mode, false, false, u, v, request));
547   return CEED_ERROR_SUCCESS;
548 }
549 
550 //------------------------------------------------------------------------------
551 // ElemRestriction Apply Points
552 //------------------------------------------------------------------------------
553 static int CeedElemRestrictionApplyAtPointsInElement_Memcheck(CeedElemRestriction rstr, CeedInt elem, CeedTransposeMode t_mode, CeedVector u,
554                                                               CeedVector v, CeedRequest *request) {
555   CeedInt                       num_comp;
556   CeedElemRestriction_Memcheck *impl;
557 
558   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
559   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
560   return impl->Apply(rstr, num_comp, 0, 1, elem, elem + 1, t_mode, false, false, u, v, request);
561 }
562 
563 //------------------------------------------------------------------------------
564 // ElemRestriction Apply Block
565 //------------------------------------------------------------------------------
566 static int CeedElemRestrictionApplyBlock_Memcheck(CeedElemRestriction rstr, CeedInt block, CeedTransposeMode t_mode, CeedVector u, CeedVector v,
567                                                   CeedRequest *request) {
568   CeedInt                       block_size, num_comp, comp_stride;
569   CeedElemRestriction_Memcheck *impl;
570 
571   CeedCallBackend(CeedElemRestrictionGetBlockSize(rstr, &block_size));
572   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
573   CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride));
574   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
575   CeedCallBackend(impl->Apply(rstr, num_comp, block_size, comp_stride, block, block + 1, t_mode, true, true, u, v, request));
576   return CEED_ERROR_SUCCESS;
577 }
578 
579 //------------------------------------------------------------------------------
580 // ElemRestriction Get Offsets
581 //------------------------------------------------------------------------------
582 static int CeedElemRestrictionGetOffsets_Memcheck(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt **offsets) {
583   CeedElemRestriction_Memcheck *impl;
584 
585   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
586 
587   CeedCheck(mem_type == CEED_MEM_HOST, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_BACKEND, "Can only provide to HOST memory");
588 
589   *offsets = impl->offsets;
590   return CEED_ERROR_SUCCESS;
591 }
592 
593 //------------------------------------------------------------------------------
594 // ElemRestriction Get Orientations
595 //------------------------------------------------------------------------------
596 static int CeedElemRestrictionGetOrientations_Memcheck(CeedElemRestriction rstr, CeedMemType mem_type, const bool **orients) {
597   CeedElemRestriction_Memcheck *impl;
598 
599   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
600 
601   CeedCheck(mem_type == CEED_MEM_HOST, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_BACKEND, "Can only provide to HOST memory");
602 
603   *orients = impl->orients;
604   return CEED_ERROR_SUCCESS;
605 }
606 
607 //------------------------------------------------------------------------------
608 // ElemRestriction Get Curl-Conforming Orientations
609 //------------------------------------------------------------------------------
610 static int CeedElemRestrictionGetCurlOrientations_Memcheck(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt8 **curl_orients) {
611   CeedElemRestriction_Memcheck *impl;
612 
613   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
614 
615   CeedCheck(mem_type == CEED_MEM_HOST, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_BACKEND, "Can only provide to HOST memory");
616 
617   *curl_orients = impl->curl_orients;
618   return CEED_ERROR_SUCCESS;
619 }
620 
621 //------------------------------------------------------------------------------
622 // ElemRestriction Destroy
623 //------------------------------------------------------------------------------
624 static int CeedElemRestrictionDestroy_Memcheck(CeedElemRestriction rstr) {
625   CeedElemRestriction_Memcheck *impl;
626 
627   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
628   CeedCallBackend(CeedFree(&impl->offsets_allocated));
629   CeedCallBackend(CeedFree(&impl->orients_allocated));
630   CeedCallBackend(CeedFree(&impl->curl_orients_allocated));
631   CeedCallBackend(CeedFree(&impl));
632   return CEED_ERROR_SUCCESS;
633 }
634 
635 //------------------------------------------------------------------------------
636 // ElemRestriction Create
637 //------------------------------------------------------------------------------
638 int CeedElemRestrictionCreate_Memcheck(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *offsets, const bool *orients,
639                                        const CeedInt8 *curl_orients, CeedElemRestriction rstr) {
640   Ceed                          ceed;
641   CeedInt                       num_elem, elem_size, num_block, block_size, num_comp, comp_stride, num_points = 0, num_offsets;
642   CeedRestrictionType           rstr_type;
643   CeedElemRestriction_Memcheck *impl;
644 
645   CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed));
646   CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem));
647   CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
648   CeedCallBackend(CeedElemRestrictionGetNumBlocks(rstr, &num_block));
649   CeedCallBackend(CeedElemRestrictionGetBlockSize(rstr, &block_size));
650   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
651   CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride));
652   CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type));
653 
654   CeedCheck(mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "Only MemType = HOST supported");
655 
656   CeedCallBackend(CeedCalloc(1, &impl));
657   CeedCallBackend(CeedElemRestrictionSetData(rstr, impl));
658 
659   // Set layouts
660   {
661     bool    has_backend_strides;
662     CeedInt e_layout[3] = {1, elem_size, elem_size * num_comp}, l_layout[3] = {0};
663 
664     CeedCallBackend(CeedElemRestrictionSetELayout(rstr, e_layout));
665     if (rstr_type == CEED_RESTRICTION_STRIDED) {
666       CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides));
667       if (has_backend_strides) {
668         CeedCallBackend(CeedElemRestrictionGetBackendStrides_Memcheck(rstr, l_layout));
669         CeedCallBackend(CeedElemRestrictionSetLLayout(rstr, l_layout));
670       }
671     }
672   }
673 
674   // Offsets data
675   if (rstr_type != CEED_RESTRICTION_STRIDED) {
676     const char *resource;
677 
678     // Check indices for ref or memcheck backends
679     {
680       Ceed current = ceed, parent = NULL;
681 
682       CeedCallBackend(CeedGetParent(current, &parent));
683       while (current != parent) {
684         current = parent;
685         CeedCallBackend(CeedGetParent(current, &parent));
686       }
687       CeedCallBackend(CeedGetResource(parent, &resource));
688     }
689     if (!strcmp(resource, "/cpu/self/ref/serial") || !strcmp(resource, "/cpu/self/ref/blocked") || !strcmp(resource, "/cpu/self/memcheck/serial") ||
690         !strcmp(resource, "/cpu/self/memcheck/blocked")) {
691       CeedSize l_size;
692 
693       CeedCallBackend(CeedElemRestrictionGetLVectorSize(rstr, &l_size));
694       for (CeedInt i = 0; i < num_elem * elem_size; i++) {
695         CeedCheck(offsets[i] >= 0 && offsets[i] + (num_comp - 1) * comp_stride < l_size, ceed, CEED_ERROR_BACKEND,
696                   "Restriction offset %" CeedInt_FMT " (%" CeedInt_FMT ") out of range [0, %" CeedInt_FMT "]", i, offsets[i], l_size);
697       }
698     }
699 
700     // Copy data
701     if (rstr_type == CEED_RESTRICTION_POINTS) CeedCallBackend(CeedElemRestrictionGetNumPoints(rstr, &num_points));
702     num_offsets = rstr_type == CEED_RESTRICTION_POINTS ? (num_elem + 1 + num_points) : (num_elem * elem_size);
703     switch (copy_mode) {
704       case CEED_COPY_VALUES:
705         CeedCallBackend(CeedMalloc(num_offsets, &impl->offsets_allocated));
706         memcpy(impl->offsets_allocated, offsets, num_offsets * sizeof(offsets[0]));
707         impl->offsets = impl->offsets_allocated;
708         break;
709       case CEED_OWN_POINTER:
710         impl->offsets_allocated = (CeedInt *)offsets;
711         impl->offsets           = impl->offsets_allocated;
712         break;
713       case CEED_USE_POINTER:
714         impl->offsets = offsets;
715     }
716 
717     // Orientation data
718     if (rstr_type == CEED_RESTRICTION_ORIENTED) {
719       CeedCheck(orients != NULL, ceed, CEED_ERROR_BACKEND, "No orients array provided for oriented restriction");
720       switch (copy_mode) {
721         case CEED_COPY_VALUES:
722           CeedCallBackend(CeedMalloc(num_offsets, &impl->orients_allocated));
723           memcpy(impl->orients_allocated, orients, num_offsets * sizeof(orients[0]));
724           impl->orients = impl->orients_allocated;
725           break;
726         case CEED_OWN_POINTER:
727           impl->orients_allocated = (bool *)orients;
728           impl->orients           = impl->orients_allocated;
729           break;
730         case CEED_USE_POINTER:
731           impl->orients = orients;
732       }
733     } else if (rstr_type == CEED_RESTRICTION_CURL_ORIENTED) {
734       CeedCheck(curl_orients != NULL, ceed, CEED_ERROR_BACKEND, "No curl_orients array provided for oriented restriction");
735       switch (copy_mode) {
736         case CEED_COPY_VALUES:
737           CeedCallBackend(CeedMalloc(3 * num_offsets, &impl->curl_orients_allocated));
738           memcpy(impl->curl_orients_allocated, curl_orients, 3 * num_offsets * sizeof(curl_orients[0]));
739           impl->curl_orients = impl->curl_orients_allocated;
740           break;
741         case CEED_OWN_POINTER:
742           impl->curl_orients_allocated = (CeedInt8 *)curl_orients;
743           impl->curl_orients           = impl->curl_orients_allocated;
744           break;
745         case CEED_USE_POINTER:
746           impl->curl_orients = curl_orients;
747       }
748     }
749   }
750 
751   // Set apply function
752   impl->Apply = CeedElemRestrictionApply_Memcheck_Core;
753 
754   // Register backend functions
755   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Apply", CeedElemRestrictionApply_Memcheck));
756   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnsigned", CeedElemRestrictionApplyUnsigned_Memcheck));
757   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnoriented", CeedElemRestrictionApplyUnoriented_Memcheck));
758   if (rstr_type == CEED_RESTRICTION_POINTS) {
759     CeedCallBackend(
760         CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyAtPointsInElement", CeedElemRestrictionApplyAtPointsInElement_Memcheck));
761   }
762   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyBlock", CeedElemRestrictionApplyBlock_Memcheck));
763   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOffsets", CeedElemRestrictionGetOffsets_Memcheck));
764   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOrientations", CeedElemRestrictionGetOrientations_Memcheck));
765   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetCurlOrientations", CeedElemRestrictionGetCurlOrientations_Memcheck));
766   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Memcheck));
767   return CEED_ERROR_SUCCESS;
768 }
769 
770 //------------------------------------------------------------------------------
771