xref: /libCEED/backends/memcheck/ceed-memcheck-restriction.c (revision bc10c746797d546ceed65c88f3b6daeb3aa944ca) !
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <ceed/backend.h>
10 #include <stdbool.h>
11 #include <stdlib.h>
12 #include <string.h>
13 
14 #include "ceed-memcheck.h"
15 
16 //------------------------------------------------------------------------------
17 // Set backend strides
18 //------------------------------------------------------------------------------
19 static inline int CeedElemRestrictionGetBackendStrides_Memcheck(CeedElemRestriction rstr, CeedInt strides[3]) {
20   CeedInt elem_size, num_comp, num_elem;
21 
22   CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
23   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
24   CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem));
25   // Memcheck default, contiguous by component, then node
26   strides[0] = num_comp;
27   strides[1] = 1;
28   strides[2] = num_comp * elem_size;
29   /**
30       // CPU default, contiguous by node, then component
31       strides[0] = 1;
32       strides[1] = elem_size;
33       strides[2] = elem_size * num_comp;
34 
35       // GPU default, contiguous by node, then element
36       strides[0] = 1;
37       strides[1] = num_elem * elem_size;
38       strides[2] = elem_size;
39      **/
40   return CEED_ERROR_SUCCESS;
41 }
42 
43 //------------------------------------------------------------------------------
44 // Core ElemRestriction Apply Code
45 //------------------------------------------------------------------------------
// Apply a strided (offset-free) restriction, L-vector to E-vector: vv = r * uu.
// uu is addressed via strides[3] = {node, component, element}; vv is written as
// contiguous [elem_size, num_comp] panels per element, block_size entries wide,
// shifted down by v_offset for this range of blocks.
static inline int CeedElemRestrictionApplyStridedNoTranspose_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
                                                                           CeedInt start, CeedInt stop, CeedInt num_elem, CeedInt elem_size,
                                                                           CeedSize v_offset, const CeedScalar *__restrict__ uu,
                                                                           CeedScalar *__restrict__ vv) {
  // Get strides
  bool    has_backend_strides;
  CeedInt strides[3] = {0};

  // Use backend-default strides unless the user provided explicit ones
  CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides));
  if (has_backend_strides) CeedCallBackend(CeedElemRestrictionGetBackendStrides_Memcheck(rstr, strides));
  else CeedCallBackend(CeedElemRestrictionGetStrides(rstr, strides));

  // Apply restriction
  for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
    CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
      CeedPragmaSIMD for (CeedSize n = 0; n < elem_size; n++) {
        CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
          // CeedIntMin clamps padding lanes of a partial final block to the
          // last real element so the read stays in bounds; the (CeedSize) cast
          // keeps the index arithmetic in the wider type
          vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
              uu[n * strides[0] + k * strides[1] + CeedIntMin(e + j, num_elem - 1) * (CeedSize)strides[2]];
        }
      }
    }
  }
  return CEED_ERROR_SUCCESS;
}
71 
// Apply a standard (offset-based) restriction, L-vector to E-vector:
// gather vv[elem node] = uu[offsets[node] + component * comp_stride].
static inline int CeedElemRestrictionApplyOffsetNoTranspose_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
                                                                          const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem,
                                                                          CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu,
                                                                          CeedScalar *__restrict__ vv) {
  // Default restriction with offsets
  CeedElemRestriction_Memcheck *impl;

  CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
  for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
    CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
      // i runs over all (node, block lane) pairs of the element block
      CeedPragmaSIMD for (CeedSize i = 0; i < elem_size * block_size; i++) {
        vv[elem_size * (k * block_size + e * num_comp) + i - v_offset] = uu[impl->offsets[i + e * elem_size] + k * comp_stride];
      }
    }
  }
  return CEED_ERROR_SUCCESS;
}
89 
// Apply an oriented restriction, L-vector to E-vector: same gather as the
// offset version, but each entry is negated when its orientation flag is set.
static inline int CeedElemRestrictionApplyOrientedNoTranspose_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp,
                                                                            const CeedInt block_size, const CeedInt comp_stride, CeedInt start,
                                                                            CeedInt stop, CeedInt num_elem, CeedInt elem_size, CeedSize v_offset,
                                                                            const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
  // Restriction with orientations
  CeedElemRestriction_Memcheck *impl;

  CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
  for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
    CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
      CeedPragmaSIMD for (CeedSize i = 0; i < elem_size * block_size; i++) {
        // orients[.] true => flip the sign of the gathered value
        vv[elem_size * (k * block_size + e * num_comp) + i - v_offset] =
            uu[impl->offsets[i + e * elem_size] + k * comp_stride] * (impl->orients[i + e * elem_size] ? -1.0 : 1.0);
      }
    }
  }
  return CEED_ERROR_SUCCESS;
}
108 
// Apply a curl-oriented restriction, L-vector to E-vector. Each element node
// value is a tridiagonal combination of up to three gathered L-vector entries;
// curl_orients stores 3 coefficients per node (sub/diagonal/super), indexed
// (3*n + 0/1/2) per node n, block_size lanes wide, 3*elem_size per element.
static inline int CeedElemRestrictionApplyCurlOrientedNoTranspose_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp,
                                                                                const CeedInt block_size, const CeedInt comp_stride, CeedInt start,
                                                                                CeedInt stop, CeedInt num_elem, CeedInt elem_size, CeedSize v_offset,
                                                                                const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
  // Restriction with tridiagonal transformation
  CeedElemRestriction_Memcheck *impl;

  CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
  for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
    CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
      CeedSize n = 0;

      // First row (n == 0): diagonal and superdiagonal terms only
      CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
        vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
            uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] *
                impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size] +
            uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] *
                impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size];
      }
      // Interior rows: subdiagonal, diagonal, and superdiagonal terms
      CeedPragmaSIMD for (n = 1; n < elem_size - 1; n++) {
        CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
          vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
              uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] *
                  impl->curl_orients[j + (3 * n + 0) * block_size + e * 3 * elem_size] +
              uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] *
                  impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size] +
              uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] *
                  impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size];
        }
      }
      // Last row (n == elem_size - 1 after the loop): subdiagonal and diagonal
      CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
        vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
            uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] *
                impl->curl_orients[j + (3 * n + 0) * block_size + e * 3 * elem_size] +
            uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] *
                impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size];
      }
    }
  }
  return CEED_ERROR_SUCCESS;
}
150 
// Unsigned variant of the curl-oriented no-transpose apply: identical
// tridiagonal structure, but abs() discards the sign of each curl_orients
// coefficient so only magnitudes are applied.
static inline int CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Memcheck_Core(
    CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride, CeedInt start, CeedInt stop,
    CeedInt num_elem, CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
  // Restriction with (unsigned) tridiagonal transformation
  CeedElemRestriction_Memcheck *impl;

  CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
  for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
    CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
      CeedSize n = 0;

      // First row (n == 0): diagonal and superdiagonal terms only
      CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
        vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
            uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] *
                abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]) +
            uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] *
                abs(impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size]);
      }
      // Interior rows: subdiagonal, diagonal, and superdiagonal terms
      CeedPragmaSIMD for (n = 1; n < elem_size - 1; n++) {
        CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
          vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
              uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] *
                  abs(impl->curl_orients[j + (3 * n + 0) * block_size + e * 3 * elem_size]) +
              uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] *
                  abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]) +
              uu[impl->offsets[j + (n + 1) * block_size + e * elem_size] + k * comp_stride] *
                  abs(impl->curl_orients[j + (3 * n + 2) * block_size + e * 3 * elem_size]);
        }
      }
      // Last row (n == elem_size - 1 after the loop): subdiagonal and diagonal
      CeedPragmaSIMD for (CeedSize j = 0; j < block_size; j++) {
        vv[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] =
            uu[impl->offsets[j + (n - 1) * block_size + e * elem_size] + k * comp_stride] *
                abs(impl->curl_orients[j + (3 * n + 0) * block_size + e * 3 * elem_size]) +
            uu[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] *
                abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]);
      }
    }
  }
  return CEED_ERROR_SUCCESS;
}
191 
// Apply the transpose of a strided restriction, E-vector to L-vector:
// vv += r^T * uu (sum into the L-vector addressed through strides).
static inline int CeedElemRestrictionApplyStridedTranspose_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
                                                                         CeedInt start, CeedInt stop, CeedInt num_elem, CeedInt elem_size,
                                                                         CeedSize v_offset, const CeedScalar *__restrict__ uu,
                                                                         CeedScalar *__restrict__ vv) {
  // Get strides
  bool    has_backend_strides;
  CeedInt strides[3] = {0};

  // Use backend-default strides unless the user provided explicit ones
  CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides));
  if (has_backend_strides) CeedCallBackend(CeedElemRestrictionGetBackendStrides_Memcheck(rstr, strides));
  else CeedCallBackend(CeedElemRestrictionGetStrides(rstr, strides));

  // Apply restriction
  for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
    CeedPragmaSIMD for (CeedSize k = 0; k < num_comp; k++) {
      CeedPragmaSIMD for (CeedSize n = 0; n < elem_size; n++) {
        // Loop bound CeedIntMin(block_size, num_elem - e) skips padding lanes
        // of a partial final block so they are not summed into the L-vector
        CeedPragmaSIMD for (CeedSize j = 0; j < CeedIntMin(block_size, num_elem - e); j++) {
          vv[n * strides[0] + k * strides[1] + (e + j) * strides[2]] +=
              uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset];
        }
      }
    }
  }
  return CEED_ERROR_SUCCESS;
}
217 
// Apply the transpose of a standard (offset-based) restriction, E-vector to
// L-vector: scatter-add each E-vector entry to uu's offset target. The update
// is marked CeedPragmaAtomic because distinct element nodes may share offsets.
static inline int CeedElemRestrictionApplyOffsetTranspose_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
                                                                        const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem,
                                                                        CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu,
                                                                        CeedScalar *__restrict__ vv) {
  // Default restriction with offsets
  CeedElemRestriction_Memcheck *impl;

  CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
  for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
    for (CeedSize k = 0; k < num_comp; k++) {
      for (CeedSize i = 0; i < elem_size * block_size; i += block_size) {
        // Iteration bound set to discard padding elements
        for (CeedSize j = i; j < i + CeedIntMin(block_size, num_elem - e); j++) {
          CeedScalar vv_loc;

          vv_loc = uu[elem_size * (k * block_size + e * num_comp) + j - v_offset];
          CeedPragmaAtomic vv[impl->offsets[j + e * elem_size] + k * comp_stride] += vv_loc;
        }
      }
    }
  }
  return CEED_ERROR_SUCCESS;
}
241 
// Apply the transpose of an oriented restriction, E-vector to L-vector:
// same atomic scatter-add as the offset transpose, with the entry negated
// when its orientation flag is set.
static inline int CeedElemRestrictionApplyOrientedTranspose_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
                                                                          const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedInt num_elem,
                                                                          CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu,
                                                                          CeedScalar *__restrict__ vv) {
  // Restriction with orientations
  CeedElemRestriction_Memcheck *impl;

  CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
  for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
    for (CeedSize k = 0; k < num_comp; k++) {
      for (CeedSize i = 0; i < elem_size * block_size; i += block_size) {
        // Iteration bound set to discard padding elements
        for (CeedSize j = i; j < i + CeedIntMin(block_size, num_elem - e); j++) {
          CeedScalar vv_loc;

          // orients[.] true => flip the sign before accumulating
          vv_loc = uu[elem_size * (k * block_size + e * num_comp) + j - v_offset] * (impl->orients[j + e * elem_size] ? -1.0 : 1.0);
          CeedPragmaAtomic vv[impl->offsets[j + e * elem_size] + k * comp_stride] += vv_loc;
        }
      }
    }
  }
  return CEED_ERROR_SUCCESS;
}
265 
// Apply the transpose of a curl-oriented restriction, E-vector to L-vector.
// For row n of the transposed tridiagonal transformation the coefficients come
// from neighboring rows of curl_orients: (3*n - 1) is the superdiagonal of row
// n-1 and (3*n + 3) the subdiagonal of row n+1. Each block of contributions is
// staged in vv_loc (a VLA of block_size lanes) then scattered atomically.
static inline int CeedElemRestrictionApplyCurlOrientedTranspose_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp,
                                                                              const CeedInt block_size, const CeedInt comp_stride, CeedInt start,
                                                                              CeedInt stop, CeedInt num_elem, CeedInt elem_size, CeedSize v_offset,
                                                                              const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
  // Restriction with tridiagonal transformation
  CeedElemRestriction_Memcheck *impl;
  CeedScalar                    vv_loc[block_size];

  CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
  for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
    for (CeedSize k = 0; k < num_comp; k++) {
      // Iteration bound set to discard padding elements
      const CeedSize block_end = CeedIntMin(block_size, num_elem - e);
      CeedSize       n         = 0;

      // First row (n == 0): contributions from rows n and n+1
      CeedPragmaSIMD for (CeedSize j = 0; j < block_end; j++) {
        vv_loc[j] = uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] *
                        impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size] +
                    uu[e * elem_size * num_comp + (k * elem_size + n + 1) * block_size + j - v_offset] *
                        impl->curl_orients[j + (3 * n + 3) * block_size + e * 3 * elem_size];
      }
      for (CeedSize j = 0; j < block_end; j++) {
        CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j];
      }
      // Interior rows: contributions from rows n-1, n, and n+1
      for (n = 1; n < elem_size - 1; n++) {
        CeedPragmaSIMD for (CeedSize j = 0; j < block_end; j++) {
          vv_loc[j] = uu[e * elem_size * num_comp + (k * elem_size + n - 1) * block_size + j - v_offset] *
                          impl->curl_orients[j + (3 * n - 1) * block_size + e * 3 * elem_size] +
                      uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] *
                          impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size] +
                      uu[e * elem_size * num_comp + (k * elem_size + n + 1) * block_size + j - v_offset] *
                          impl->curl_orients[j + (3 * n + 3) * block_size + e * 3 * elem_size];
        }
        for (CeedSize j = 0; j < block_end; j++) {
          CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j];
        }
      }
      // Last row (n == elem_size - 1): contributions from rows n-1 and n
      CeedPragmaSIMD for (CeedSize j = 0; j < block_end; j++) {
        vv_loc[j] = uu[e * elem_size * num_comp + (k * elem_size + n - 1) * block_size + j - v_offset] *
                        impl->curl_orients[j + (3 * n - 1) * block_size + e * 3 * elem_size] +
                    uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] *
                        impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size];
      }
      for (CeedSize j = 0; j < block_end; j++) {
        CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j];
      }
    }
  }
  return CEED_ERROR_SUCCESS;
}
316 
// Unsigned variant of the curl-oriented transpose apply: identical structure,
// but abs() discards the sign of each curl_orients coefficient so only
// magnitudes are accumulated.
static inline int CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Memcheck_Core(
    CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size, const CeedInt comp_stride, CeedInt start, CeedInt stop,
    CeedInt num_elem, CeedInt elem_size, CeedSize v_offset, const CeedScalar *__restrict__ uu, CeedScalar *__restrict__ vv) {
  // Restriction with (unsigned) tridiagonal transformation
  CeedElemRestriction_Memcheck *impl;
  CeedScalar                    vv_loc[block_size];

  CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
  for (CeedSize e = start * block_size; e < stop * block_size; e += block_size) {
    for (CeedSize k = 0; k < num_comp; k++) {
      // Iteration bound set to discard padding elements
      const CeedSize block_end = CeedIntMin(block_size, num_elem - e);
      CeedSize       n         = 0;

      // First row (n == 0): contributions from rows n and n+1
      CeedPragmaSIMD for (CeedSize j = 0; j < block_end; j++) {
        vv_loc[j] = uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] *
                        abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]) +
                    uu[e * elem_size * num_comp + (k * elem_size + n + 1) * block_size + j - v_offset] *
                        abs(impl->curl_orients[j + (3 * n + 3) * block_size + e * 3 * elem_size]);
      }
      for (CeedSize j = 0; j < block_end; j++) {
        CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j];
      }
      // Interior rows: contributions from rows n-1, n, and n+1
      for (n = 1; n < elem_size - 1; n++) {
        CeedPragmaSIMD for (CeedSize j = 0; j < block_end; j++) {
          vv_loc[j] = uu[e * elem_size * num_comp + (k * elem_size + n - 1) * block_size + j - v_offset] *
                          abs(impl->curl_orients[j + (3 * n - 1) * block_size + e * 3 * elem_size]) +
                      uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] *
                          abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]) +
                      uu[e * elem_size * num_comp + (k * elem_size + n + 1) * block_size + j - v_offset] *
                          abs(impl->curl_orients[j + (3 * n + 3) * block_size + e * 3 * elem_size]);
        }
        for (CeedSize j = 0; j < block_end; j++) {
          CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j];
        }
      }
      // Last row (n == elem_size - 1): contributions from rows n-1 and n
      CeedPragmaSIMD for (CeedSize j = 0; j < block_end; j++) {
        vv_loc[j] = uu[e * elem_size * num_comp + (k * elem_size + n - 1) * block_size + j - v_offset] *
                        abs(impl->curl_orients[j + (3 * n - 1) * block_size + e * 3 * elem_size]) +
                    uu[e * elem_size * num_comp + (k * elem_size + n) * block_size + j - v_offset] *
                        abs(impl->curl_orients[j + (3 * n + 1) * block_size + e * 3 * elem_size]);
      }
      for (CeedSize j = 0; j < block_end; j++) {
        CeedPragmaAtomic vv[impl->offsets[j + n * block_size + e * elem_size] + k * comp_stride] += vv_loc[j];
      }
    }
  }
  return CEED_ERROR_SUCCESS;
}
366 
// Apply a points restriction for elements [start, stop). impl->offsets[e]
// gives the start of element e's point index list; the per-point indices are
// then read from impl->offsets at that offset. NOTRANSPOSE gathers the
// L-vector (num_comp interleaved per point) into per-element component panels;
// TRANSPOSE writes them back (plain assignment, not sum — NOTE(review):
// differs from the other transpose kernels, presumably each point maps to a
// unique L-vector entry; confirm against callers).
static inline int CeedElemRestrictionApplyAtPointsInElement_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp, CeedInt start,
                                                                          CeedInt stop, CeedTransposeMode t_mode, const CeedScalar *__restrict__ uu,
                                                                          CeedScalar *__restrict__ vv) {
  CeedInt                       num_points, l_vec_offset;
  CeedSize                      e_vec_offset = 0;  // running offset into the E-vector
  CeedElemRestriction_Memcheck *impl;

  CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
  for (CeedSize e = start; e < stop; e++) {
    l_vec_offset = impl->offsets[e];
    CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr, e, &num_points));
    if (t_mode == CEED_NOTRANSPOSE) {
      for (CeedSize i = 0; i < num_points; i++) {
        for (CeedSize j = 0; j < num_comp; j++) vv[j * num_points + i + e_vec_offset] = uu[impl->offsets[i + l_vec_offset] * num_comp + j];
      }
    } else {
      for (CeedSize i = 0; i < num_points; i++) {
        for (CeedSize j = 0; j < num_comp; j++) vv[impl->offsets[i + l_vec_offset] * num_comp + j] = uu[j * num_points + i + e_vec_offset];
      }
    }
    // Advance past this element's [num_comp, num_points] panel
    e_vec_offset += num_points * (CeedSize)num_comp;
  }
  return CEED_ERROR_SUCCESS;
}
391 
// Shared apply driver for all restriction types. Acquires host arrays for u
// and v, then dispatches to the type-specific kernel based on restriction
// type, transpose mode, and the use_signs/use_orients flags (which select the
// signed, unsigned, or plain-offset variants for oriented and curl-oriented
// restrictions). Completes the request immediately.
static inline int CeedElemRestrictionApply_Memcheck_Core(CeedElemRestriction rstr, const CeedInt num_comp, const CeedInt block_size,
                                                         const CeedInt comp_stride, CeedInt start, CeedInt stop, CeedTransposeMode t_mode,
                                                         bool use_signs, bool use_orients, CeedVector u, CeedVector v, CeedRequest *request) {
  CeedInt             num_elem, elem_size;
  CeedSize            v_offset;
  CeedRestrictionType rstr_type;
  const CeedScalar   *uu;
  CeedScalar         *vv;

  CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem));
  CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
  // Offset of block `start` into the E-vector (CeedSize cast avoids CeedInt overflow)
  v_offset = start * block_size * elem_size * (CeedSize)num_comp;
  CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type));
  CeedCallBackend(CeedVectorGetArrayRead(u, CEED_MEM_HOST, &uu));

  if (t_mode == CEED_TRANSPOSE) {
    // Sum into for transpose mode, E-vector to L-vector
    CeedCallBackend(CeedVectorGetArray(v, CEED_MEM_HOST, &vv));
  } else {
    // Overwrite for notranspose mode, L-vector to E-vector
    CeedCallBackend(CeedVectorGetArrayWrite(v, CEED_MEM_HOST, &vv));
  }

  if (t_mode == CEED_TRANSPOSE) {
    // Restriction from E-vector to L-vector
    // Performing v += r^T * u
    // uu has shape [elem_size, num_comp, num_elem], row-major
    // vv has shape [nnodes, num_comp]
    // Sum into for transpose mode
    switch (rstr_type) {
      case CEED_RESTRICTION_STRIDED:
        CeedCallBackend(
            CeedElemRestrictionApplyStridedTranspose_Memcheck_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, v_offset, uu, vv));
        break;
      case CEED_RESTRICTION_STANDARD:
        CeedCallBackend(CeedElemRestrictionApplyOffsetTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem,
                                                                              elem_size, v_offset, uu, vv));
        break;
      case CEED_RESTRICTION_ORIENTED:
        // Without signs, an oriented restriction degenerates to a plain offset apply
        if (use_signs) {
          CeedCallBackend(CeedElemRestrictionApplyOrientedTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem,
                                                                                  elem_size, v_offset, uu, vv));
        } else {
          CeedCallBackend(CeedElemRestrictionApplyOffsetTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem,
                                                                                elem_size, v_offset, uu, vv));
        }
        break;
      case CEED_RESTRICTION_CURL_ORIENTED:
        // Signed+oriented -> full transform; oriented only -> unsigned transform;
        // neither -> plain offset apply
        if (use_signs && use_orients) {
          CeedCallBackend(CeedElemRestrictionApplyCurlOrientedTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem,
                                                                                      elem_size, v_offset, uu, vv));
        } else if (use_orients) {
          CeedCallBackend(CeedElemRestrictionApplyCurlOrientedUnsignedTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop,
                                                                                              num_elem, elem_size, v_offset, uu, vv));
        } else {
          CeedCallBackend(CeedElemRestrictionApplyOffsetTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem,
                                                                                elem_size, v_offset, uu, vv));
        }
        break;
      case CEED_RESTRICTION_POINTS:
        CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement_Memcheck_Core(rstr, num_comp, start, stop, t_mode, uu, vv));
        break;
    }
  } else {
    // Restriction from L-vector to E-vector
    // Perform: v = r * u
    // vv has shape [elem_size, num_comp, num_elem], row-major
    // uu has shape [nnodes, num_comp]
    // Overwrite for notranspose mode
    switch (rstr_type) {
      case CEED_RESTRICTION_STRIDED:
        CeedCallBackend(
            CeedElemRestrictionApplyStridedNoTranspose_Memcheck_Core(rstr, num_comp, block_size, start, stop, num_elem, elem_size, v_offset, uu, vv));
        break;
      case CEED_RESTRICTION_STANDARD:
        CeedCallBackend(CeedElemRestrictionApplyOffsetNoTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem,
                                                                                elem_size, v_offset, uu, vv));
        break;
      case CEED_RESTRICTION_ORIENTED:
        // Without signs, an oriented restriction degenerates to a plain offset apply
        if (use_signs) {
          CeedCallBackend(CeedElemRestrictionApplyOrientedNoTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem,
                                                                                    elem_size, v_offset, uu, vv));
        } else {
          CeedCallBackend(CeedElemRestrictionApplyOffsetNoTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem,
                                                                                  elem_size, v_offset, uu, vv));
        }
        break;
      case CEED_RESTRICTION_CURL_ORIENTED:
        // Signed+oriented -> full transform; oriented only -> unsigned transform;
        // neither -> plain offset apply
        if (use_signs && use_orients) {
          CeedCallBackend(CeedElemRestrictionApplyCurlOrientedNoTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop,
                                                                                        num_elem, elem_size, v_offset, uu, vv));
        } else if (use_orients) {
          CeedCallBackend(CeedElemRestrictionApplyCurlOrientedUnsignedNoTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop,
                                                                                                num_elem, elem_size, v_offset, uu, vv));
        } else {
          CeedCallBackend(CeedElemRestrictionApplyOffsetNoTranspose_Memcheck_Core(rstr, num_comp, block_size, comp_stride, start, stop, num_elem,
                                                                                  elem_size, v_offset, uu, vv));
        }
        break;
      case CEED_RESTRICTION_POINTS:
        CeedCallBackend(CeedElemRestrictionApplyAtPointsInElement_Memcheck_Core(rstr, num_comp, start, stop, t_mode, uu, vv));
        break;
    }
  }
  // Release array access and complete the (synchronous) request
  CeedCallBackend(CeedVectorRestoreArrayRead(u, &uu));
  CeedCallBackend(CeedVectorRestoreArray(v, &vv));
  if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED) *request = NULL;
  return CEED_ERROR_SUCCESS;
}
501 
502 //------------------------------------------------------------------------------
503 // ElemRestriction Apply
504 //------------------------------------------------------------------------------
505 static int CeedElemRestrictionApply_Memcheck(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v, CeedRequest *request) {
506   CeedInt                       num_block, block_size, num_comp, comp_stride;
507   CeedElemRestriction_Memcheck *impl;
508 
509   CeedCallBackend(CeedElemRestrictionGetNumBlocks(rstr, &num_block));
510   CeedCallBackend(CeedElemRestrictionGetBlockSize(rstr, &block_size));
511   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
512   CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride));
513   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
514   CeedCallBackend(impl->Apply(rstr, num_comp, block_size, comp_stride, 0, num_block, t_mode, true, true, u, v, request));
515   return CEED_ERROR_SUCCESS;
516 }
517 
518 //------------------------------------------------------------------------------
519 // ElemRestriction Apply Unsigned
520 //------------------------------------------------------------------------------
521 static int CeedElemRestrictionApplyUnsigned_Memcheck(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v,
522                                                      CeedRequest *request) {
523   CeedInt                       num_block, block_size, num_comp, comp_stride;
524   CeedElemRestriction_Memcheck *impl;
525 
526   CeedCallBackend(CeedElemRestrictionGetNumBlocks(rstr, &num_block));
527   CeedCallBackend(CeedElemRestrictionGetBlockSize(rstr, &block_size));
528   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
529   CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride));
530   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
531   CeedCallBackend(impl->Apply(rstr, num_comp, block_size, comp_stride, 0, num_block, t_mode, false, true, u, v, request));
532   return CEED_ERROR_SUCCESS;
533 }
534 
535 //------------------------------------------------------------------------------
536 // ElemRestriction Apply Unoriented
537 //------------------------------------------------------------------------------
538 static int CeedElemRestrictionApplyUnoriented_Memcheck(CeedElemRestriction rstr, CeedTransposeMode t_mode, CeedVector u, CeedVector v,
539                                                        CeedRequest *request) {
540   CeedInt                       num_block, block_size, num_comp, comp_stride;
541   CeedElemRestriction_Memcheck *impl;
542 
543   CeedCallBackend(CeedElemRestrictionGetNumBlocks(rstr, &num_block));
544   CeedCallBackend(CeedElemRestrictionGetBlockSize(rstr, &block_size));
545   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
546   CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride));
547   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
548   CeedCallBackend(impl->Apply(rstr, num_comp, block_size, comp_stride, 0, num_block, t_mode, false, false, u, v, request));
549   return CEED_ERROR_SUCCESS;
550 }
551 
552 //------------------------------------------------------------------------------
553 // ElemRestriction Apply Points
554 //------------------------------------------------------------------------------
555 static int CeedElemRestrictionApplyAtPointsInElement_Memcheck(CeedElemRestriction rstr, CeedInt elem, CeedTransposeMode t_mode, CeedVector u,
556                                                               CeedVector v, CeedRequest *request) {
557   CeedInt                       num_comp;
558   CeedElemRestriction_Memcheck *impl;
559 
560   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
561   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
562   return impl->Apply(rstr, num_comp, 0, 1, elem, elem + 1, t_mode, false, false, u, v, request);
563 }
564 
565 //------------------------------------------------------------------------------
566 // ElemRestriction Apply Block
567 //------------------------------------------------------------------------------
568 static int CeedElemRestrictionApplyBlock_Memcheck(CeedElemRestriction rstr, CeedInt block, CeedTransposeMode t_mode, CeedVector u, CeedVector v,
569                                                   CeedRequest *request) {
570   CeedInt                       block_size, num_comp, comp_stride;
571   CeedElemRestriction_Memcheck *impl;
572 
573   CeedCallBackend(CeedElemRestrictionGetBlockSize(rstr, &block_size));
574   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
575   CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride));
576   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
577   CeedCallBackend(impl->Apply(rstr, num_comp, block_size, comp_stride, block, block + 1, t_mode, true, true, u, v, request));
578   return CEED_ERROR_SUCCESS;
579 }
580 
581 //------------------------------------------------------------------------------
582 // ElemRestriction Get Offsets
583 //------------------------------------------------------------------------------
584 static int CeedElemRestrictionGetOffsets_Memcheck(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt **offsets) {
585   CeedElemRestriction_Memcheck *impl;
586 
587   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
588 
589   CeedCheck(mem_type == CEED_MEM_HOST, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_BACKEND, "Can only provide to HOST memory");
590 
591   *offsets = impl->offsets;
592   return CEED_ERROR_SUCCESS;
593 }
594 
595 //------------------------------------------------------------------------------
596 // ElemRestriction Get Orientations
597 //------------------------------------------------------------------------------
598 static int CeedElemRestrictionGetOrientations_Memcheck(CeedElemRestriction rstr, CeedMemType mem_type, const bool **orients) {
599   CeedElemRestriction_Memcheck *impl;
600 
601   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
602 
603   CeedCheck(mem_type == CEED_MEM_HOST, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_BACKEND, "Can only provide to HOST memory");
604 
605   *orients = impl->orients;
606   return CEED_ERROR_SUCCESS;
607 }
608 
609 //------------------------------------------------------------------------------
610 // ElemRestriction Get Curl-Conforming Orientations
611 //------------------------------------------------------------------------------
612 static int CeedElemRestrictionGetCurlOrientations_Memcheck(CeedElemRestriction rstr, CeedMemType mem_type, const CeedInt8 **curl_orients) {
613   CeedElemRestriction_Memcheck *impl;
614 
615   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
616 
617   CeedCheck(mem_type == CEED_MEM_HOST, CeedElemRestrictionReturnCeed(rstr), CEED_ERROR_BACKEND, "Can only provide to HOST memory");
618 
619   *curl_orients = impl->curl_orients;
620   return CEED_ERROR_SUCCESS;
621 }
622 
623 //------------------------------------------------------------------------------
624 // ElemRestriction Destroy
625 //------------------------------------------------------------------------------
626 static int CeedElemRestrictionDestroy_Memcheck(CeedElemRestriction rstr) {
627   CeedElemRestriction_Memcheck *impl;
628 
629   CeedCallBackend(CeedElemRestrictionGetData(rstr, &impl));
630   CeedCallBackend(CeedFree(&impl->offsets_allocated));
631   CeedCallBackend(CeedFree(&impl->orients_allocated));
632   CeedCallBackend(CeedFree(&impl->curl_orients_allocated));
633   CeedCallBackend(CeedFree(&impl));
634   return CEED_ERROR_SUCCESS;
635 }
636 
637 //------------------------------------------------------------------------------
638 // ElemRestriction Create
639 //------------------------------------------------------------------------------
640 int CeedElemRestrictionCreate_Memcheck(CeedMemType mem_type, CeedCopyMode copy_mode, const CeedInt *offsets, const bool *orients,
641                                        const CeedInt8 *curl_orients, CeedElemRestriction rstr) {
642   Ceed                          ceed;
643   CeedInt                       num_elem, elem_size, num_block, block_size, num_comp, comp_stride, num_points = 0, num_offsets;
644   CeedRestrictionType           rstr_type;
645   CeedElemRestriction_Memcheck *impl;
646 
647   CeedCallBackend(CeedElemRestrictionGetCeed(rstr, &ceed));
648   CeedCallBackend(CeedElemRestrictionGetNumElements(rstr, &num_elem));
649   CeedCallBackend(CeedElemRestrictionGetElementSize(rstr, &elem_size));
650   CeedCallBackend(CeedElemRestrictionGetNumBlocks(rstr, &num_block));
651   CeedCallBackend(CeedElemRestrictionGetBlockSize(rstr, &block_size));
652   CeedCallBackend(CeedElemRestrictionGetNumComponents(rstr, &num_comp));
653   CeedCallBackend(CeedElemRestrictionGetCompStride(rstr, &comp_stride));
654   CeedCallBackend(CeedElemRestrictionGetType(rstr, &rstr_type));
655 
656   CeedCheck(mem_type == CEED_MEM_HOST, ceed, CEED_ERROR_BACKEND, "Only MemType = HOST supported");
657 
658   CeedCallBackend(CeedCalloc(1, &impl));
659   CeedCallBackend(CeedElemRestrictionSetData(rstr, impl));
660 
661   // Set layouts
662   {
663     bool    has_backend_strides;
664     CeedInt e_layout[3] = {1, elem_size, elem_size * num_comp}, l_layout[3] = {0};
665 
666     CeedCallBackend(CeedElemRestrictionSetELayout(rstr, e_layout));
667     if (rstr_type == CEED_RESTRICTION_STRIDED) {
668       CeedCallBackend(CeedElemRestrictionHasBackendStrides(rstr, &has_backend_strides));
669       if (has_backend_strides) {
670         CeedCallBackend(CeedElemRestrictionGetBackendStrides_Memcheck(rstr, l_layout));
671         CeedCallBackend(CeedElemRestrictionSetLLayout(rstr, l_layout));
672       }
673     }
674   }
675 
676   // Offsets data
677   if (rstr_type != CEED_RESTRICTION_STRIDED) {
678     // Check indices
679     {
680       CeedSize l_size;
681 
682       CeedCallBackend(CeedElemRestrictionGetLVectorSize(rstr, &l_size));
683       for (CeedInt i = 0; i < num_elem * elem_size; i++) {
684         CeedCheck(offsets[i] >= 0 && offsets[i] + (num_comp - 1) * comp_stride < l_size, ceed, CEED_ERROR_BACKEND,
685                   "Restriction offset %" CeedInt_FMT " (%" CeedInt_FMT ") out of range [0, %" CeedInt_FMT "]", i, offsets[i], l_size);
686       }
687     }
688 
689     // Copy data
690     if (rstr_type == CEED_RESTRICTION_POINTS) CeedCallBackend(CeedElemRestrictionGetNumPoints(rstr, &num_points));
691     num_offsets = rstr_type == CEED_RESTRICTION_POINTS ? (num_elem + 1 + num_points) : (num_elem * elem_size);
692     switch (copy_mode) {
693       case CEED_COPY_VALUES:
694         CeedCallBackend(CeedMalloc(num_offsets, &impl->offsets_allocated));
695         memcpy(impl->offsets_allocated, offsets, num_offsets * sizeof(offsets[0]));
696         impl->offsets = impl->offsets_allocated;
697         break;
698       case CEED_OWN_POINTER:
699         impl->offsets_allocated = (CeedInt *)offsets;
700         impl->offsets           = impl->offsets_allocated;
701         break;
702       case CEED_USE_POINTER:
703         impl->offsets = offsets;
704     }
705 
706     // Orientation data
707     if (rstr_type == CEED_RESTRICTION_ORIENTED) {
708       CeedCheck(orients != NULL, ceed, CEED_ERROR_BACKEND, "No orients array provided for oriented restriction");
709       switch (copy_mode) {
710         case CEED_COPY_VALUES:
711           CeedCallBackend(CeedMalloc(num_offsets, &impl->orients_allocated));
712           memcpy(impl->orients_allocated, orients, num_offsets * sizeof(orients[0]));
713           impl->orients = impl->orients_allocated;
714           break;
715         case CEED_OWN_POINTER:
716           impl->orients_allocated = (bool *)orients;
717           impl->orients           = impl->orients_allocated;
718           break;
719         case CEED_USE_POINTER:
720           impl->orients = orients;
721       }
722     } else if (rstr_type == CEED_RESTRICTION_CURL_ORIENTED) {
723       CeedCheck(curl_orients != NULL, ceed, CEED_ERROR_BACKEND, "No curl_orients array provided for oriented restriction");
724       switch (copy_mode) {
725         case CEED_COPY_VALUES:
726           CeedCallBackend(CeedMalloc(3 * num_offsets, &impl->curl_orients_allocated));
727           memcpy(impl->curl_orients_allocated, curl_orients, 3 * num_offsets * sizeof(curl_orients[0]));
728           impl->curl_orients = impl->curl_orients_allocated;
729           break;
730         case CEED_OWN_POINTER:
731           impl->curl_orients_allocated = (CeedInt8 *)curl_orients;
732           impl->curl_orients           = impl->curl_orients_allocated;
733           break;
734         case CEED_USE_POINTER:
735           impl->curl_orients = curl_orients;
736       }
737     }
738   }
739 
740   // Set apply function
741   impl->Apply = CeedElemRestrictionApply_Memcheck_Core;
742 
743   // Register backend functions
744   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Apply", CeedElemRestrictionApply_Memcheck));
745   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnsigned", CeedElemRestrictionApplyUnsigned_Memcheck));
746   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyUnoriented", CeedElemRestrictionApplyUnoriented_Memcheck));
747   if (rstr_type == CEED_RESTRICTION_POINTS) {
748     CeedCallBackend(
749         CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyAtPointsInElement", CeedElemRestrictionApplyAtPointsInElement_Memcheck));
750   }
751   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "ApplyBlock", CeedElemRestrictionApplyBlock_Memcheck));
752   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOffsets", CeedElemRestrictionGetOffsets_Memcheck));
753   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetOrientations", CeedElemRestrictionGetOrientations_Memcheck));
754   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "GetCurlOrientations", CeedElemRestrictionGetCurlOrientations_Memcheck));
755   CeedCallBackend(CeedSetBackendFunction(ceed, "ElemRestriction", rstr, "Destroy", CeedElemRestrictionDestroy_Memcheck));
756   return CEED_ERROR_SUCCESS;
757 }
758 
759 //------------------------------------------------------------------------------
760