xref: /petsc/src/sys/utils/sortso.c (revision 9371c9d470a9602b6d10a8bf50c9b2280a79e45a)
1 #include <petscsys.h> /*I  "petscsys.h"  I*/
2 #include <petsc/private/petscimpl.h>
3 
4 static inline int Compare_PetscMPIInt_Private(const void *left, const void *right, PETSC_UNUSED void *ctx) {
5   PetscMPIInt l = *(PetscMPIInt *)left, r = *(PetscMPIInt *)right;
6   return (l < r) ? -1 : (l > r);
7 }
8 
9 static inline int Compare_PetscInt_Private(const void *left, const void *right, PETSC_UNUSED void *ctx) {
10   PetscInt l = *(PetscInt *)left, r = *(PetscInt *)right;
11   return (l < r) ? -1 : (l > r);
12 }
13 
14 static inline int Compare_PetscReal_Private(const void *left, const void *right, PETSC_UNUSED void *ctx) {
15   PetscReal l = *(PetscReal *)left, r = *(PetscReal *)right;
16   return (l < r) ? -1 : (l > r);
17 }
18 
19 #define MIN_GALLOP_CONST_GLOBAL 8
20 static PetscInt MIN_GALLOP_GLOBAL = MIN_GALLOP_CONST_GLOBAL;
21 typedef int (*CompFunc)(const void *, const void *, void *);
22 
23 /* Mostly to force clang uses the builtins, but can't hurt to have gcc compilers also do the same */
24 #if !defined(PETSC_USE_DEBUG) && defined(__GNUC__)
25 static inline void COPYSWAPPY(char *a, char *b, char *t, size_t size) {
26   __builtin_memcpy(t, b, size);
27   __builtin_memmove(b, a, size);
28   __builtin_memcpy(a, t, size);
29   return;
30 }
31 
32 static inline void COPYSWAPPY2(char *al, char *ar, size_t asize, char *bl, char *br, size_t bsize, char *t) {
33   __builtin_memcpy(t, ar, asize);
34   __builtin_memmove(ar, al, asize);
35   __builtin_memcpy(al, t, asize);
36   __builtin_memcpy(t, br, bsize);
37   __builtin_memmove(br, bl, bsize);
38   __builtin_memcpy(bl, t, bsize);
39   return;
40 }
41 
42 static inline void Petsc_memcpy(char *dest, const char *src, size_t size) {
43   __builtin_memcpy(dest, src, size);
44   return;
45 }
46 
47 static inline void Petsc_memcpy2(char *adest, const char *asrc, size_t asize, char *bdest, const char *bsrc, size_t bsize) {
48   __builtin_memcpy(adest, asrc, asize);
49   __builtin_memcpy(bdest, bsrc, bsize);
50   return;
51 }
52 
53 static inline void Petsc_memmove(char *dest, const char *src, size_t size) {
54   __builtin_memmove(dest, src, size);
55   return;
56 }
57 
58 static inline void Petsc_memmove2(char *adest, const char *asrc, size_t asize, char *bdest, const char *bsrc, size_t bsize) {
59   __builtin_memmove(adest, asrc, asize);
60   __builtin_memmove(bdest, bsrc, bsize);
61   return;
62 }
63 #else
64 static inline void COPYSWAPPY(char *a, char *b, char *t, size_t size) {
65   PetscFunctionBegin;
66   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(t, b, size));
67   PetscCallAbort(PETSC_COMM_SELF, PetscMemmove(b, a, size));
68   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(a, t, size));
69   PetscFunctionReturnVoid();
70 }
71 
72 static inline void COPYSWAPPY2(char *al, char *ar, size_t asize, char *bl, char *br, size_t bsize, char *t) {
73   PetscFunctionBegin;
74   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(t, ar, asize));
75   PetscCallAbort(PETSC_COMM_SELF, PetscMemmove(ar, al, asize));
76   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(al, t, asize));
77   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(t, br, bsize));
78   PetscCallAbort(PETSC_COMM_SELF, PetscMemmove(br, bl, bsize));
79   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(bl, t, bsize));
80   PetscFunctionReturnVoid();
81 }
82 
83 static inline void Petsc_memcpy(char *dest, const char *src, size_t size) {
84   PetscFunctionBegin;
85   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(dest, src, size));
86   PetscFunctionReturnVoid();
87 }
88 
89 static inline void Petsc_memcpy2(char *adest, const char *asrc, size_t asize, char *bdest, const char *bsrc, size_t bsize) {
90   PetscFunctionBegin;
91   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(adest, asrc, asize));
92   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(bdest, bsrc, bsize));
93   PetscFunctionReturnVoid();
94 }
95 
96 static inline void Petsc_memmove(char *dest, const char *src, size_t size) {
97   PetscFunctionBegin;
98   PetscCallAbort(PETSC_COMM_SELF, PetscMemmove(dest, src, size));
99   PetscFunctionReturnVoid();
100 }
101 
102 static inline void Petsc_memmove2(char *adest, const char *asrc, size_t asize, char *bdest, const char *bsrc, size_t bsize) {
103   PetscFunctionBegin;
104   PetscCallAbort(PETSC_COMM_SELF, PetscMemmove(adest, asrc, asize));
105   PetscCallAbort(PETSC_COMM_SELF, PetscMemmove(bdest, bsrc, bsize));
106   PetscFunctionReturnVoid();
107 }
108 #endif
109 
110 /* Start left look right. Looking for e.g. B[0] in A or mergelo. l inclusive, r inclusive. Returns first m such that arr[m] >
111  x. Output also inclusive.
112 
113  NOTE: Both gallopsearch functions CANNOT distinguish between inserting AFTER the entire array vs inserting at entry n!! For example for an array:
114 
115    {0,1,2,3,4,5}
116 
117    when looking to insert "5" this routine will return *m = 6, but when looking to insert "6" it will ALSO return *m = 6.
118   */
119 static inline PetscErrorCode PetscGallopSearchLeft_Private(const char *arr, size_t size, CompFunc cmp, void *ctx, PetscInt l, PetscInt r, const char *x, PetscInt *m) {
120   PetscInt last = l, k = 1, mid, cur = l + 1;
121 
122   PetscFunctionBegin;
123   *m = l;
124   PetscAssert(r >= l, PETSC_COMM_SELF, PETSC_ERR_PLIB, "r %" PetscInt_FMT " < l %" PetscInt_FMT " in PetscGallopSearchLeft", r, l);
125   if ((*cmp)(x, arr + r * size, ctx) >= 0) {
126     *m = r;
127     PetscFunctionReturn(0);
128   }
129   if ((*cmp)(x, (arr) + l * size, ctx) < 0 || PetscUnlikely(!(r - l))) PetscFunctionReturn(0);
130   while (PETSC_TRUE) {
131     if (PetscUnlikely(cur > r)) {
132       cur = r;
133       break;
134     }
135     if ((*cmp)(x, (arr) + cur * size, ctx) < 0) break;
136     last = cur;
137     cur += (k <<= 1) + 1;
138     ++k;
139   }
140   /* standard binary search but take last 0 mid 0 cur 1 into account*/
141   while (cur > last + 1) {
142     mid = last + ((cur - last) >> 1);
143     if ((*cmp)(x, (arr) + mid * size, ctx) < 0) {
144       cur = mid;
145     } else {
146       last = mid;
147     }
148   }
149   *m = cur;
150   PetscFunctionReturn(0);
151 }
152 
153 /* Start right look left. Looking for e.g. A[-1] in B or mergehi. l inclusive, r inclusive. Returns last m such that arr[m]
154  < x. Output also inclusive */
155 static inline PetscErrorCode PetscGallopSearchRight_Private(const char *arr, size_t size, CompFunc cmp, void *ctx, PetscInt l, PetscInt r, const char *x, PetscInt *m) {
156   PetscInt last = r, k = 1, mid, cur = r - 1;
157 
158   PetscFunctionBegin;
159   *m = r;
160   PetscAssert(r >= l, PETSC_COMM_SELF, PETSC_ERR_PLIB, "r %" PetscInt_FMT " < l %" PetscInt_FMT " in PetscGallopSearchRight", r, l);
161   if ((*cmp)(x, arr + l * size, ctx) <= 0) {
162     *m = l;
163     PetscFunctionReturn(0);
164   }
165   if ((*cmp)(x, (arr) + r * size, ctx) > 0 || PetscUnlikely(!(r - l))) PetscFunctionReturn(0);
166   while (PETSC_TRUE) {
167     if (PetscUnlikely(cur < l)) {
168       cur = l;
169       break;
170     }
171     if ((*cmp)(x, (arr) + cur * size, ctx) > 0) break;
172     last = cur;
173     cur -= (k <<= 1) + 1;
174     ++k;
175   }
176   /* standard binary search but take last r-1 mid r-1 cur r-2 into account*/
177   while (last > cur + 1) {
178     mid = last - ((last - cur) >> 1);
179     if ((*cmp)(x, (arr) + mid * size, ctx) > 0) {
180       cur = mid;
181     } else {
182       last = mid;
183     }
184   }
185   *m = cur;
186   PetscFunctionReturn(0);
187 }
188 
189 /* Mergesort where size of left half <= size of right half, so mergesort is done left to right. Arr should be pointer to
190  complete array, left is first index of left array, mid is first index of right array, right is last index of right
191  array */
192 static inline PetscErrorCode PetscTimSortMergeLo_Private(char *arr, char *tarr, size_t size, CompFunc cmp, void *ctx, PetscInt left, PetscInt mid, PetscInt right) {
193   PetscInt       i = 0, j = mid, k = left, gallopleft = 0, gallopright = 0;
194   const PetscInt llen = mid - left;
195 
196   PetscFunctionBegin;
197   Petsc_memcpy(tarr, arr + (left * size), llen * size);
198   while ((i < llen) && (j <= right)) {
199     if ((*cmp)(tarr + (i * size), arr + (j * size), ctx) < 0) {
200       Petsc_memcpy(arr + (k * size), tarr + (i * size), size);
201       ++k;
202       gallopright = 0;
203       if (++i < llen && ++gallopleft >= MIN_GALLOP_GLOBAL) {
204         PetscInt l1, l2, diff1, diff2;
205         do {
206           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
207           /* search temp for right[j], can move up to that of temp into arr immediately */
208           PetscCall(PetscGallopSearchLeft_Private(tarr, size, cmp, ctx, i, llen - 1, arr + (j * size), &l1));
209           diff1 = l1 - i;
210           Petsc_memcpy(arr + (k * size), tarr + (i * size), diff1 * size);
211           k += diff1;
212           i = l1;
213           /* search right for temp[i], can move up to that many of right into arr */
214           PetscCall(PetscGallopSearchLeft_Private(arr, size, cmp, ctx, j, right, tarr + (i * size), &l2));
215           diff2 = l2 - j;
216           Petsc_memmove((arr) + k * size, (arr) + j * size, diff2 * size);
217           k += diff2;
218           j = l2;
219           if (i >= llen || j > right) break;
220         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
221         ++MIN_GALLOP_GLOBAL;
222       }
223     } else {
224       Petsc_memmove(arr + (k * size), arr + (j * size), size);
225       ++k;
226       gallopleft = 0;
227       if (++j <= right && ++gallopright >= MIN_GALLOP_GLOBAL) {
228         PetscInt l1, l2, diff1, diff2;
229         do {
230           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
231           /* search right for temp[i], can move up to that many of right into arr */
232           PetscCall(PetscGallopSearchLeft_Private(arr, size, cmp, ctx, j, right, tarr + (i * size), &l2));
233           diff2 = l2 - j;
234           Petsc_memmove(arr + (k * size), arr + (j * size), diff2 * size);
235           k += diff2;
236           j = l2;
237           /* search temp for right[j], can copy up to that of temp into arr immediately */
238           PetscCall(PetscGallopSearchLeft_Private(tarr, size, cmp, ctx, i, llen - 1, arr + (j * size), &l1));
239           diff1 = l1 - i;
240           Petsc_memcpy(arr + (k * size), tarr + (i * size), diff1 * size);
241           k += diff1;
242           i = l1;
243           if (i >= llen || j > right) break;
244         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
245         ++MIN_GALLOP_GLOBAL;
246       }
247     }
248   }
249   if (i < llen) { Petsc_memcpy(arr + (k * size), tarr + (i * size), (llen - i) * size); }
250   PetscFunctionReturn(0);
251 }
252 
253 static inline PetscErrorCode PetscTimSortMergeLoWithArray_Private(char *arr, char *atarr, size_t asize, char *barr, char *btarr, size_t bsize, CompFunc cmp, void *ctx, PetscInt left, PetscInt mid, PetscInt right) {
254   PetscInt       i = 0, j = mid, k = left, gallopleft = 0, gallopright = 0;
255   const PetscInt llen = mid - left;
256 
257   PetscFunctionBegin;
258   Petsc_memcpy2(atarr, arr + (left * asize), llen * asize, btarr, barr + (left * bsize), llen * bsize);
259   while ((i < llen) && (j <= right)) {
260     if ((*cmp)(atarr + (i * asize), arr + (j * asize), ctx) < 0) {
261       Petsc_memcpy2(arr + (k * asize), atarr + (i * asize), asize, barr + (k * bsize), btarr + (i * bsize), bsize);
262       ++k;
263       gallopright = 0;
264       if (++i < llen && ++gallopleft >= MIN_GALLOP_GLOBAL) {
265         PetscInt l1, l2, diff1, diff2;
266         do {
267           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
268           /* search temp for right[j], can move up to that of temp into arr immediately */
269           PetscCall(PetscGallopSearchLeft_Private(atarr, asize, cmp, ctx, i, llen - 1, arr + (j * asize), &l1));
270           diff1 = l1 - i;
271           Petsc_memcpy2(arr + (k * asize), atarr + (i * asize), diff1 * asize, barr + (k * bsize), btarr + (i * bsize), diff1 * bsize);
272           k += diff1;
273           i = l1;
274           /* search right for temp[i], can move up to that many of right into arr */
275           PetscCall(PetscGallopSearchLeft_Private(arr, asize, cmp, ctx, j, right, atarr + (i * asize), &l2));
276           diff2 = l2 - j;
277           Petsc_memmove2(arr + (k * asize), arr + (j * asize), diff2 * asize, barr + (k * bsize), barr + (j * bsize), diff2 * bsize);
278           k += diff2;
279           j = l2;
280           if (i >= llen || j > right) break;
281         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
282         ++MIN_GALLOP_GLOBAL;
283       }
284     } else {
285       Petsc_memmove2(arr + (k * asize), arr + (j * asize), asize, barr + (k * bsize), barr + (j * bsize), bsize);
286       ++k;
287       gallopleft = 0;
288       if (++j <= right && ++gallopright >= MIN_GALLOP_GLOBAL) {
289         PetscInt l1, l2, diff1, diff2;
290         do {
291           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
292           /* search right for temp[i], can move up to that many of right into arr */
293           PetscCall(PetscGallopSearchLeft_Private(arr, asize, cmp, ctx, j, right, atarr + (i * asize), &l2));
294           diff2 = l2 - j;
295           Petsc_memmove2(arr + (k * asize), arr + (j * asize), diff2 * asize, barr + (k * bsize), barr + (j * bsize), diff2 * bsize);
296           k += diff2;
297           j = l2;
298           /* search temp for right[j], can copy up to that of temp into arr immediately */
299           PetscCall(PetscGallopSearchLeft_Private(atarr, asize, cmp, ctx, i, llen - 1, arr + (j * asize), &l1));
300           diff1 = l1 - i;
301           Petsc_memcpy2(arr + (k * asize), atarr + (i * asize), diff1 * asize, barr + (k * bsize), btarr + (i * bsize), diff1 * bsize);
302           k += diff1;
303           i = l1;
304           if (i >= llen || j > right) break;
305         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
306         ++MIN_GALLOP_GLOBAL;
307       }
308     }
309   }
310   if (i < llen) { Petsc_memcpy2(arr + (k * asize), atarr + (i * asize), (llen - i) * asize, barr + (k * bsize), btarr + (i * bsize), (llen - i) * bsize); }
311   PetscFunctionReturn(0);
312 }
313 
314 /* Mergesort where size of right half < size of left half, so mergesort is done right to left. Arr should be pointer to
315  complete array, left is first index of left array, mid is first index of right array, right is last index of right
316  array */
317 static inline PetscErrorCode PetscTimSortMergeHi_Private(char *arr, char *tarr, size_t size, CompFunc cmp, void *ctx, PetscInt left, PetscInt mid, PetscInt right) {
318   PetscInt       i = right - mid, j = mid - 1, k = right, gallopleft = 0, gallopright = 0;
319   const PetscInt rlen = right - mid + 1;
320 
321   PetscFunctionBegin;
322   Petsc_memcpy(tarr, (arr) + mid * size, rlen * size);
323   while ((i >= 0) && (j >= left)) {
324     if ((*cmp)((tarr) + i * size, (arr) + j * size, ctx) > 0) {
325       Petsc_memcpy((arr) + k * size, (tarr) + i * size, size);
326       --k;
327       gallopleft = 0;
328       if (--i >= 0 && ++gallopright >= MIN_GALLOP_GLOBAL) {
329         PetscInt l1, l2, diff1, diff2;
330         do {
331           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
332           /* search temp for left[j], can copy up to that many of temp into arr */
333           PetscCall(PetscGallopSearchRight_Private(tarr, size, cmp, ctx, 0, i, (arr) + j * size, &l1));
334           diff1 = i - l1;
335           Petsc_memcpy((arr) + (k - diff1 + 1) * size, (tarr) + (l1 + 1) * size, diff1 * size);
336           k -= diff1;
337           i = l1;
338           /* search left for temp[i], can move up to that many of left up arr */
339           PetscCall(PetscGallopSearchRight_Private(arr, size, cmp, ctx, left, j, (tarr) + i * size, &l2));
340           diff2 = j - l2;
341           Petsc_memmove((arr) + (k - diff2 + 1) * size, (arr) + (l2 + 1) * size, diff2 * size);
342           k -= diff2;
343           j = l2;
344           if (i < 0 || j < left) break;
345         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
346         ++MIN_GALLOP_GLOBAL;
347       }
348     } else {
349       Petsc_memmove((arr) + k * size, (arr) + j * size, size);
350       --k;
351       gallopright = 0;
352       if (--j >= left && ++gallopleft >= MIN_GALLOP_GLOBAL) {
353         PetscInt l1, l2, diff1, diff2;
354         do {
355           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
356           /* search left for temp[i], can move up to that many of left up arr */
357           PetscCall(PetscGallopSearchRight_Private(arr, size, cmp, ctx, left, j, (tarr) + i * size, &l2));
358           diff2 = j - l2;
359           Petsc_memmove((arr) + (k - diff2 + 1) * size, (arr) + (l2 + 1) * size, diff2 * size);
360           k -= diff2;
361           j = l2;
362           /* search temp for left[j], can copy up to that many of temp into arr */
363           PetscCall(PetscGallopSearchRight_Private(tarr, size, cmp, ctx, 0, i, (arr) + j * size, &l1));
364           diff1 = i - l1;
365           Petsc_memcpy((arr) + (k - diff1 + 1) * size, (tarr) + (l1 + 1) * size, diff1 * size);
366           k -= diff1;
367           i = l1;
368           if (i < 0 || j < left) break;
369         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
370         ++MIN_GALLOP_GLOBAL;
371       }
372     }
373   }
374   if (i >= 0) { Petsc_memcpy((arr) + left * size, tarr, (i + 1) * size); }
375   PetscFunctionReturn(0);
376 }
377 
378 static inline PetscErrorCode PetscTimSortMergeHiWithArray_Private(char *arr, char *atarr, size_t asize, char *barr, char *btarr, size_t bsize, CompFunc cmp, void *ctx, PetscInt left, PetscInt mid, PetscInt right) {
379   PetscInt       i = right - mid, j = mid - 1, k = right, gallopleft = 0, gallopright = 0;
380   const PetscInt rlen = right - mid + 1;
381 
382   PetscFunctionBegin;
383   Petsc_memcpy2(atarr, (arr) + mid * asize, rlen * asize, btarr, (barr) + mid * bsize, rlen * bsize);
384   while ((i >= 0) && (j >= left)) {
385     if ((*cmp)((atarr) + i * asize, (arr) + j * asize, ctx) > 0) {
386       Petsc_memcpy2((arr) + k * asize, (atarr) + i * asize, asize, (barr) + k * bsize, (btarr) + i * bsize, bsize);
387       --k;
388       gallopleft = 0;
389       if (--i >= 0 && ++gallopright >= MIN_GALLOP_GLOBAL) {
390         PetscInt l1, l2, diff1, diff2;
391         do {
392           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
393           /* search temp for left[j], can copy up to that many of temp into arr */
394           PetscCall(PetscGallopSearchRight_Private(atarr, asize, cmp, ctx, 0, i, (arr) + j * asize, &l1));
395           diff1 = i - l1;
396           Petsc_memcpy2((arr) + (k - diff1 + 1) * asize, (atarr) + (l1 + 1) * asize, diff1 * asize, (barr) + (k - diff1 + 1) * bsize, (btarr) + (l1 + 1) * bsize, diff1 * bsize);
397           k -= diff1;
398           i = l1;
399           /* search left for temp[i], can move up to that many of left up arr */
400           PetscCall(PetscGallopSearchRight_Private(arr, asize, cmp, ctx, left, j, (atarr) + i * asize, &l2));
401           diff2 = j - l2;
402           Petsc_memmove2((arr) + (k - diff2 + 1) * asize, (arr) + (l2 + 1) * asize, diff2 * asize, (barr) + (k - diff2 + 1) * bsize, (barr) + (l2 + 1) * bsize, diff2 * bsize);
403           k -= diff2;
404           j = l2;
405           if (i < 0 || j < left) break;
406         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
407         ++MIN_GALLOP_GLOBAL;
408       }
409     } else {
410       Petsc_memmove2((arr) + k * asize, (arr) + j * asize, asize, (barr) + k * bsize, (barr) + j * bsize, bsize);
411       --k;
412       gallopright = 0;
413       if (--j >= left && ++gallopleft >= MIN_GALLOP_GLOBAL) {
414         PetscInt l1, l2, diff1, diff2;
415         do {
416           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
417           /* search left for temp[i], can move up to that many of left up arr */
418           PetscCall(PetscGallopSearchRight_Private(arr, asize, cmp, ctx, left, j, (atarr) + i * asize, &l2));
419           diff2 = j - l2;
420           Petsc_memmove2((arr) + (k - diff2 + 1) * asize, (arr) + (l2 + 1) * asize, diff2 * asize, (barr) + (k - diff2 + 1) * bsize, (barr) + (l2 + 1) * bsize, diff2 * bsize);
421           k -= diff2;
422           j = l2;
423           /* search temp for left[j], can copy up to that many of temp into arr */
424           PetscCall(PetscGallopSearchRight_Private(atarr, asize, cmp, ctx, 0, i, (arr) + j * asize, &l1));
425           diff1 = i - l1;
426           Petsc_memcpy2((arr) + (k - diff1 + 1) * asize, (atarr) + (l1 + 1) * asize, diff1 * asize, (barr) + (k - diff1 + 1) * bsize, (btarr) + (l1 + 1) * bsize, diff1 * bsize);
427           k -= diff1;
428           i = l1;
429           if (i < 0 || j < left) break;
430         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
431         ++MIN_GALLOP_GLOBAL;
432       }
433     }
434   }
435   if (i >= 0) { Petsc_memcpy2((arr) + left * asize, atarr, (i + 1) * asize, (barr) + left * bsize, btarr, (i + 1) * bsize); }
436   PetscFunctionReturn(0);
437 }
438 
439 /* Left is inclusive lower bound of array slice, start is start location of unsorted section, right is inclusive upper
440  bound of array slice. If unsure of where unsorted section starts or if entire length is unsorted pass start = left */
441 static inline PetscErrorCode PetscInsertionSort_Private(char *arr, char *tarr, size_t size, CompFunc cmp, void *ctx, PetscInt left, PetscInt start, PetscInt right) {
442   PetscInt i = start == left ? start + 1 : start;
443 
444   PetscFunctionBegin;
445   for (; i <= right; ++i) {
446     PetscInt j = i - 1;
447     Petsc_memcpy(tarr, arr + (i * size), size);
448     while ((j >= left) && ((*cmp)(tarr, (arr) + j * size, ctx) < 0)) {
449       COPYSWAPPY(arr + (j + 1) * size, arr + j * size, tarr + size, size);
450       --j;
451     }
452     Petsc_memcpy((arr) + (j + 1) * size, tarr, size);
453   }
454   PetscFunctionReturn(0);
455 }
456 
457 static inline PetscErrorCode PetscInsertionSortWithArray_Private(char *arr, char *atarr, size_t asize, char *barr, char *btarr, size_t bsize, CompFunc cmp, void *ctx, PetscInt left, PetscInt start, PetscInt right) {
458   PetscInt i = start == left ? start + 1 : start;
459 
460   PetscFunctionBegin;
461   for (; i <= right; ++i) {
462     PetscInt j = i - 1;
463     Petsc_memcpy2(atarr, arr + (i * asize), asize, btarr, barr + (i * bsize), bsize);
464     while ((j >= left) && ((*cmp)(atarr, arr + (j * asize), ctx) < 0)) {
465       COPYSWAPPY2(arr + (j + 1) * asize, arr + j * asize, asize, barr + (j + 1) * bsize, barr + j * bsize, bsize, atarr + asize);
466       --j;
467     }
468     Petsc_memcpy2(arr + (j + 1) * asize, atarr, asize, barr + (j + 1) * bsize, btarr, bsize);
469   }
470   PetscFunctionReturn(0);
471 }
472 
473 /* See PetscInsertionSort_Private */
474 static inline PetscErrorCode PetscBinaryInsertionSort_Private(char *arr, char *tarr, size_t size, CompFunc cmp, void *ctx, PetscInt left, PetscInt start, PetscInt right) {
475   PetscInt i = start == left ? start + 1 : start;
476 
477   PetscFunctionBegin;
478   for (; i <= right; ++i) {
479     PetscInt l = left, r = i;
480     Petsc_memcpy(tarr, arr + (i * size), size);
481     do {
482       const PetscInt m = l + ((r - l) >> 1);
483       if ((*cmp)(tarr, arr + (m * size), ctx) < 0) {
484         r = m;
485       } else {
486         l = m + 1;
487       }
488     } while (l < r);
489     Petsc_memmove(arr + ((l + 1) * size), arr + (l * size), (i - l) * size);
490     Petsc_memcpy(arr + (l * size), tarr, size);
491   }
492   PetscFunctionReturn(0);
493 }
494 
495 static inline PetscErrorCode PetscBinaryInsertionSortWithArray_Private(char *arr, char *atarr, size_t asize, char *barr, char *btarr, size_t bsize, CompFunc cmp, void *ctx, PetscInt left, PetscInt start, PetscInt right) {
496   PetscInt i = start == left ? start + 1 : start;
497 
498   PetscFunctionBegin;
499   for (; i <= right; ++i) {
500     PetscInt l = left, r = i;
501     Petsc_memcpy2(atarr, arr + (i * asize), asize, btarr, barr + (i * bsize), bsize);
502     do {
503       const PetscInt m = l + ((r - l) >> 1);
504       if ((*cmp)(atarr, arr + (m * asize), ctx) < 0) {
505         r = m;
506       } else {
507         l = m + 1;
508       }
509     } while (l < r);
510     Petsc_memmove2(arr + ((l + 1) * asize), arr + (l * asize), (i - l) * asize, barr + ((l + 1) * bsize), barr + (l * bsize), (i - l) * bsize);
511     Petsc_memcpy2(arr + (l * asize), atarr, asize, barr + (l * bsize), btarr, bsize);
512   }
513   PetscFunctionReturn(0);
514 }
515 
516 typedef struct {
517   PetscInt size;
518   PetscInt start;
519 #if defined(__CRAY_AARCH64) /* segfaults with Cray compilers for aarch64 on a64FX */
520 } PetscTimSortStack;
521 #else
522 } PetscTimSortStack PETSC_ATTRIBUTEALIGNED(2 * sizeof(PetscInt));
523 #endif
524 
525 typedef struct {
526   char *ptr PETSC_ATTRIBUTEALIGNED(PETSC_MEMALIGN);
527   size_t    size;
528   size_t    maxsize;
529 } PetscTimSortBuffer;
530 
531 static inline PetscErrorCode PetscTimSortResizeBuffer_Private(PetscTimSortBuffer *buff, size_t newSize) {
532   PetscFunctionBegin;
533   if (PetscLikely(newSize <= buff->size)) PetscFunctionReturn(0);
534   {
535     /* Can't be larger than n, there is merit to simply allocating buff to n to begin with */
536     size_t newMax = PetscMin(newSize * newSize, buff->maxsize);
537     PetscCall(PetscFree(buff->ptr));
538     PetscCall(PetscMalloc1(newMax, &buff->ptr));
539     buff->size = newMax;
540   }
541   PetscFunctionReturn(0);
542 }
543 
544 static inline PetscErrorCode PetscTimSortForceCollapse_Private(char *arr, size_t size, CompFunc cmp, void *ctx, PetscTimSortBuffer *buff, PetscTimSortStack *stack, PetscInt stacksize) {
545   PetscFunctionBegin;
546   for (; stacksize; --stacksize) {
547     /* A = stack[i-1], B = stack[i], if A[-1] <= B[0] means sorted */
548     if ((*cmp)(arr + (stack[stacksize].start - 1) * size, arr + (stack[stacksize].start) * size, ctx) > 0) {
549       PetscInt l, m = stack[stacksize].start, r;
550       /* Search A for B[0] insertion */
551       PetscCall(PetscGallopSearchLeft_Private(arr, size, cmp, ctx, stack[stacksize - 1].start, stack[stacksize].start - 1, (arr) + (stack[stacksize].start) * size, &l));
552       /* Search B for A[-1] insertion */
553       PetscCall(PetscGallopSearchRight_Private(arr, size, cmp, ctx, stack[stacksize].start, stack[stacksize].start + stack[stacksize].size - 1, (arr) + (stack[stacksize].start - 1) * size, &r));
554       if (m - l <= r - m) {
555         PetscCall(PetscTimSortResizeBuffer_Private(buff, (m - l + 1) * size));
556         PetscCall(PetscTimSortMergeLo_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
557       } else {
558         PetscCall(PetscTimSortResizeBuffer_Private(buff, (r - m + 1) * size));
559         PetscCall(PetscTimSortMergeHi_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
560       }
561     }
562     /* Update A with merge */
563     stack[stacksize - 1].size += stack[stacksize].size;
564   }
565   PetscFunctionReturn(0);
566 }
567 
568 static inline PetscErrorCode PetscTimSortForceCollapseWithArray_Private(char *arr, size_t asize, char *barr, size_t bsize, CompFunc cmp, void *ctx, PetscTimSortBuffer *abuff, PetscTimSortBuffer *bbuff, PetscTimSortStack *stack, PetscInt stacksize) {
569   PetscFunctionBegin;
570   for (; stacksize; --stacksize) {
571     /* A = stack[i-1], B = stack[i], if A[-1] <= B[0] means sorted */
572     if ((*cmp)(arr + (stack[stacksize].start - 1) * asize, arr + (stack[stacksize].start) * asize, ctx) > 0) {
573       PetscInt l, m = stack[stacksize].start, r;
574       /* Search A for B[0] insertion */
575       PetscCall(PetscGallopSearchLeft_Private(arr, asize, cmp, ctx, stack[stacksize - 1].start, stack[stacksize].start - 1, (arr) + (stack[stacksize].start) * asize, &l));
576       /* Search B for A[-1] insertion */
577       PetscCall(PetscGallopSearchRight_Private(arr, asize, cmp, ctx, stack[stacksize].start, stack[stacksize].start + stack[stacksize].size - 1, (arr) + (stack[stacksize].start - 1) * asize, &r));
578       if (m - l <= r - m) {
579         PetscCall(PetscTimSortResizeBuffer_Private(abuff, (m - l + 1) * asize));
580         PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (m - l + 1) * bsize));
581         PetscCall(PetscTimSortMergeLoWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
582       } else {
583         PetscCall(PetscTimSortResizeBuffer_Private(abuff, (r - m + 1) * asize));
584         PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (r - m + 1) * bsize));
585         PetscCall(PetscTimSortMergeHiWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
586       }
587     }
588     /* Update A with merge */
589     stack[stacksize - 1].size += stack[stacksize].size;
590   }
591   PetscFunctionReturn(0);
592 }
593 
594 static inline PetscErrorCode PetscTimSortMergeCollapse_Private(char *arr, size_t size, CompFunc cmp, void *ctx, PetscTimSortBuffer *buff, PetscTimSortStack *stack, PetscInt *stacksize) {
595   PetscInt i = *stacksize;
596 
597   PetscFunctionBegin;
598   while (i) {
599     PetscInt l, m, r, itemp = i;
600 
601     if (i == 1) {
602       /* A = stack[i-1], B = stack[i] */
603       if (stack[i - 1].size < stack[i].size) {
604         /* if A[-1] <= B[0] then sorted */
605         if ((*cmp)(arr + (stack[i].start - 1) * size, arr + (stack[i].start) * size, ctx) > 0) {
606           m = stack[i].start;
607           /* Search A for B[0] insertion */
608           PetscCall(PetscGallopSearchLeft_Private(arr, size, cmp, ctx, stack[i - 1].start, stack[i].start - 1, (arr) + stack[i].start * size, &l));
609           /* Search B for A[-1] insertion */
610           PetscCall(PetscGallopSearchRight_Private(arr, size, cmp, ctx, stack[i].start, stack[i].start + stack[i].size - 1, arr + (stack[i].start - 1) * size, &r));
611           if (m - l <= r - m) {
612             PetscCall(PetscTimSortResizeBuffer_Private(buff, (m - l + 1) * size));
613             PetscCall(PetscTimSortMergeLo_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
614           } else {
615             PetscCall(PetscTimSortResizeBuffer_Private(buff, (r - m + 1) * size));
616             PetscCall(PetscTimSortMergeHi_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
617           }
618         }
619         /* Update A with merge */
620         stack[i - 1].size += stack[i].size;
621         --i;
622       }
623     } else {
624       /* i > 2, i.e. C exists
625        A = stack[i-2], B = stack[i-1], C = stack[i]; */
626       if (stack[i - 2].size <= stack[i - 1].size + stack[i].size) {
627         if (stack[i - 2].size < stack[i].size) {
628           /* merge B into A, but if A[-1] <= B[0] then already sorted */
629           if ((*cmp)(arr + (stack[i - 1].start - 1) * size, arr + (stack[i - 1].start) * size, ctx) > 0) {
630             m = stack[i - 1].start;
631             /* Search A for B[0] insertion */
632             PetscCall(PetscGallopSearchLeft_Private(arr, size, cmp, ctx, stack[i - 2].start, stack[i - 1].start - 1, (arr) + (stack[i - 1].start) * size, &l));
633             /* Search B for A[-1] insertion */
634             PetscCall(PetscGallopSearchRight_Private(arr, size, cmp, ctx, stack[i - 1].start, stack[i - 1].start + stack[i - 1].size - 1, (arr) + (stack[i - 1].start - 1) * size, &r));
635             if (m - l <= r - m) {
636               PetscCall(PetscTimSortResizeBuffer_Private(buff, (m - l + 1) * size));
637               PetscCall(PetscTimSortMergeLo_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
638             } else {
639               PetscCall(PetscTimSortResizeBuffer_Private(buff, (r - m + 1) * size));
640               PetscCall(PetscTimSortMergeHi_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
641             }
642           }
643           /* Update A with merge */
644           stack[i - 2].size += stack[i - 1].size;
645           /* Push C up the stack */
646           stack[i - 1].start = stack[i].start;
647           stack[i - 1].size  = stack[i].size;
648         } else {
649         /* merge C into B */
650         mergeBC:
651           /* If B[-1] <= C[0] then... you know the drill */
652           if ((*cmp)(arr + (stack[i].start - 1) * size, arr + (stack[i].start) * size, ctx) > 0) {
653             m = stack[i].start;
654             /* Search B for C[0] insertion */
655             PetscCall(PetscGallopSearchLeft_Private(arr, size, cmp, ctx, stack[i - 1].start, stack[i].start - 1, arr + stack[i].start * size, &l));
656             /* Search C for B[-1] insertion */
657             PetscCall(PetscGallopSearchRight_Private(arr, size, cmp, ctx, stack[i].start, stack[i].start + stack[i].size - 1, (arr) + (stack[i].start - 1) * size, &r));
658             if (m - l <= r - m) {
659               PetscCall(PetscTimSortResizeBuffer_Private(buff, (m - l + 1) * size));
660               PetscCall(PetscTimSortMergeLo_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
661             } else {
662               PetscCall(PetscTimSortResizeBuffer_Private(buff, (r - m + 1) * size));
663               PetscCall(PetscTimSortMergeHi_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
664             }
665           }
666           /* Update B with merge */
667           stack[i - 1].size += stack[i].size;
668         }
669         --i;
670       } else if (stack[i - 1].size <= stack[i].size) {
671         /* merge C into B */
672         goto mergeBC;
673       }
674     }
675     if (itemp == i) break;
676   }
677   *stacksize = i;
678   PetscFunctionReturn(0);
679 }
680 
681 static inline PetscErrorCode PetscTimSortMergeCollapseWithArray_Private(char *arr, size_t asize, char *barr, size_t bsize, CompFunc cmp, void *ctx, PetscTimSortBuffer *abuff, PetscTimSortBuffer *bbuff, PetscTimSortStack *stack, PetscInt *stacksize) {
682   PetscInt i = *stacksize;
683 
684   PetscFunctionBegin;
685   while (i) {
686     PetscInt l, m, r, itemp = i;
687 
688     if (i == 1) {
689       /* A = stack[i-1], B = stack[i] */
690       if (stack[i - 1].size < stack[i].size) {
691         /* if A[-1] <= B[0] then sorted */
692         if ((*cmp)(arr + (stack[i].start - 1) * asize, arr + (stack[i].start) * asize, ctx) > 0) {
693           m = stack[i].start;
694           /* Search A for B[0] insertion */
695           PetscCall(PetscGallopSearchLeft_Private(arr, asize, cmp, ctx, stack[i - 1].start, stack[i].start - 1, (arr) + stack[i].start * asize, &l));
696           /* Search B for A[-1] insertion */
697           PetscCall(PetscGallopSearchRight_Private(arr, asize, cmp, ctx, stack[i].start, stack[i].start + stack[i].size - 1, arr + (stack[i].start - 1) * asize, &r));
698           if (m - l <= r - m) {
699             PetscCall(PetscTimSortResizeBuffer_Private(abuff, (m - l + 1) * asize));
700             PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (m - l + 1) * bsize));
701             PetscCall(PetscTimSortMergeLoWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
702           } else {
703             PetscCall(PetscTimSortResizeBuffer_Private(abuff, (r - m + 1) * asize));
704             PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (r - m + 1) * bsize));
705             PetscCall(PetscTimSortMergeHiWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
706           }
707         }
708         /* Update A with merge */
709         stack[i - 1].size += stack[i].size;
710         --i;
711       }
712     } else {
713       /* i > 2, i.e. C exists
714        A = stack[i-2], B = stack[i-1], C = stack[i]; */
715       if (stack[i - 2].size <= stack[i - 1].size + stack[i].size) {
716         if (stack[i - 2].size < stack[i].size) {
717           /* merge B into A, but if A[-1] <= B[0] then already sorted */
718           if ((*cmp)(arr + (stack[i - 1].start - 1) * asize, arr + (stack[i - 1].start) * asize, ctx) > 0) {
719             m = stack[i - 1].start;
720             /* Search A for B[0] insertion */
721             PetscCall(PetscGallopSearchLeft_Private(arr, asize, cmp, ctx, stack[i - 2].start, stack[i - 1].start - 1, (arr) + (stack[i - 1].start) * asize, &l));
722             /* Search B for A[-1] insertion */
723             PetscCall(PetscGallopSearchRight_Private(arr, asize, cmp, ctx, stack[i - 1].start, stack[i - 1].start + stack[i - 1].size - 1, (arr) + (stack[i - 1].start - 1) * asize, &r));
724             if (m - l <= r - m) {
725               PetscCall(PetscTimSortResizeBuffer_Private(abuff, (m - l + 1) * asize));
726               PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (m - l + 1) * bsize));
727               PetscCall(PetscTimSortMergeLoWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
728             } else {
729               PetscCall(PetscTimSortResizeBuffer_Private(abuff, (r - m + 1) * asize));
730               PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (r - m + 1) * bsize));
731               PetscCall(PetscTimSortMergeHiWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
732             }
733           }
734           /* Update A with merge */
735           stack[i - 2].size += stack[i - 1].size;
736           /* Push C up the stack */
737           stack[i - 1].start = stack[i].start;
738           stack[i - 1].size  = stack[i].size;
739         } else {
740         /* merge C into B */
741         mergeBC:
742           /* If B[-1] <= C[0] then... you know the drill */
743           if ((*cmp)(arr + (stack[i].start - 1) * asize, arr + (stack[i].start) * asize, ctx) > 0) {
744             m = stack[i].start;
745             /* Search B for C[0] insertion */
746             PetscCall(PetscGallopSearchLeft_Private(arr, asize, cmp, ctx, stack[i - 1].start, stack[i].start - 1, arr + stack[i].start * asize, &l));
747             /* Search C for B[-1] insertion */
748             PetscCall(PetscGallopSearchRight_Private(arr, asize, cmp, ctx, stack[i].start, stack[i].start + stack[i].size - 1, (arr) + (stack[i].start - 1) * asize, &r));
749             if (m - l <= r - m) {
750               PetscCall(PetscTimSortResizeBuffer_Private(abuff, (m - l + 1) * asize));
751               PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (m - l + 1) * bsize));
752               PetscCall(PetscTimSortMergeLoWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
753             } else {
754               PetscCall(PetscTimSortResizeBuffer_Private(abuff, (r - m + 1) * asize));
755               PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (r - m + 1) * bsize));
756               PetscCall(PetscTimSortMergeHiWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
757             }
758           }
759           /* Update B with merge */
760           stack[i - 1].size += stack[i].size;
761         }
762         --i;
763       } else if (stack[i - 1].size <= stack[i].size) {
764         /* merge C into B */
765         goto mergeBC;
766       }
767     }
768     if (itemp == i) break;
769   }
770   *stacksize = i;
771   PetscFunctionReturn(0);
772 }
773 
774 /* March sequentially through the array building up a "run" of weakly increasing or strictly decreasing contiguous
775  elements. Decreasing runs are reversed by swapping. If the run is less than minrun, artificially extend it via either
776  binary insertion sort or regulat insertion sort */
777 static inline PetscErrorCode PetscTimSortBuildRun_Private(char *arr, char *tarr, size_t size, CompFunc cmp, void *ctx, PetscInt n, PetscInt minrun, PetscInt runstart, PetscInt *runend) {
778   const PetscInt re = PetscMin(runstart + minrun, n - 1);
779   PetscInt       ri = runstart;
780 
781   PetscFunctionBegin;
782   if (PetscUnlikely(runstart == n - 1)) {
783     *runend = runstart;
784     PetscFunctionReturn(0);
785   }
786   /* guess whether run is ascending or descending and tally up the longest consecutive run. essentially a coinflip for random data */
787   if ((*cmp)((arr) + (ri + 1) * size, (arr) + ri * size, ctx) < 0) {
788     ++ri;
789     while (ri < n - 1) {
790       if ((*cmp)((arr) + (ri + 1) * size, (arr) + ri * size, ctx) >= 0) break;
791       ++ri;
792     }
793     {
794       PetscInt lo = runstart, hi = ri;
795       for (; lo < hi; ++lo, --hi) { COPYSWAPPY(arr + lo * size, arr + hi * size, tarr, size); }
796     }
797   } else {
798     ++ri;
799     while (ri < n - 1) {
800       if ((*cmp)((arr) + (ri + 1) * size, (arr) + ri * size, ctx) < 0) break;
801       ++ri;
802     }
803   }
804   if (ri < re) {
805     /* the attempt failed, this section likely contains random data. If ri got close to minrun (within 50%) then we try
806      binary search */
807     if (ri - runstart <= minrun >> 1) {
808       ++MIN_GALLOP_GLOBAL; /* didn't get close hedge our bets against random data */
809       PetscInsertionSort_Private(arr, tarr, size, cmp, ctx, runstart, ri, re);
810     } else {
811       PetscBinaryInsertionSort_Private(arr, tarr, size, cmp, ctx, runstart, ri, re);
812     }
813     *runend = re;
814   } else *runend = ri;
815   PetscFunctionReturn(0);
816 }
817 
818 static inline PetscErrorCode PetscTimSortBuildRunWithArray_Private(char *arr, char *atarr, size_t asize, char *barr, char *btarr, size_t bsize, CompFunc cmp, void *ctx, PetscInt n, PetscInt minrun, PetscInt runstart, PetscInt *runend) {
819   const PetscInt re = PetscMin(runstart + minrun, n - 1);
820   PetscInt       ri = runstart;
821 
822   PetscFunctionBegin;
823   if (PetscUnlikely(runstart == n - 1)) {
824     *runend = runstart;
825     PetscFunctionReturn(0);
826   }
827   /* guess whether run is ascending or descending and tally up the longest consecutive run. essentially a coinflip for random data */
828   if ((*cmp)((arr) + (ri + 1) * asize, arr + (ri * asize), ctx) < 0) {
829     ++ri;
830     while (ri < n - 1) {
831       if ((*cmp)((arr) + (ri + 1) * asize, (arr) + ri * asize, ctx) >= 0) break;
832       ++ri;
833     }
834     {
835       PetscInt lo = runstart, hi = ri;
836       for (; lo < hi; ++lo, --hi) { COPYSWAPPY2(arr + lo * asize, arr + hi * asize, asize, barr + lo * bsize, barr + hi * bsize, bsize, atarr); }
837     }
838   } else {
839     ++ri;
840     while (ri < n - 1) {
841       if ((*cmp)((arr) + (ri + 1) * asize, (arr) + ri * asize, ctx) < 0) break;
842       ++ri;
843     }
844   }
845   if (ri < re) {
846     /* the attempt failed, this section likely contains random data. If ri got close to minrun (within 50%) then we try
847      binary search */
848     if (ri - runstart <= minrun >> 1) {
849       ++MIN_GALLOP_GLOBAL; /* didn't get close hedge our bets against random data */
850       PetscInsertionSortWithArray_Private(arr, atarr, asize, barr, btarr, bsize, cmp, ctx, runstart, ri, re);
851     } else {
852       PetscBinaryInsertionSortWithArray_Private(arr, atarr, asize, barr, btarr, bsize, cmp, ctx, runstart, ri, re);
853     }
854     *runend = re;
855   } else *runend = ri;
856   PetscFunctionReturn(0);
857 }
858 
859 /*@C
860   PetscTimSort - Sorts an array in place in increasing order using Tim Peters adaptive sorting algorithm.
861 
862   Not Collective
863 
864   Input Parameters:
865 + n    - number of values
866 . arr  - array to be sorted
867 . size - size in bytes of the datatype held in arr
868 . cmp  - function pointer to comparison function
869 - ctx  - optional context to be passed to comparison function, NULL if not needed
870 
871   Output Parameters:
872 . arr  - sorted array
873 
874   Notes:
875   Timsort makes the assumption that input data is already likely partially ordered, or that it contains contiguous
876  sections (termed 'runs') where the data is locally ordered (but not necessarily globally ordered). It therefore aims to
877  select slices of the array in such a way that resulting mergesorts operate on near perfectly length-balanced arrays. To
878  do so it repeatedly triggers attempts throughout to merge adjacent runs.
879 
880   Should one run continuously "win" a comparison the algorithm begins the "gallop" phase. It will aggressively search
881   the "winner" for the location of the "losers" next entry (and vice versa) to copy all preceding elements into place in
882   bulk. However if the data is truly unordered (as is the case with random data) the immense gains possible from these
883   searches are expected __not__ to repay their costs. While adjacent arrays are almost all nearly the same size, they
884   likely all contain similar data.
885 
886   Sample usage:
887   The comparison function must follow the qsort() comparison function paradigm, returning the sign of the difference
888   between its arguments. If left < right : return -1, if left == right : return 0, if left > right : return 1. The user
889   may also
890  change or reverse the order of the sort by flipping the above. Note that stability of the sort is only guaranteed if
891  the comparison function forms a valid trigraph. For example when sorting an array of type "my_type" in increasing
892   order
893 
894 .vb
895   int my_increasing_comparison_function(const void *left, const void *right, void *ctx) {
896     my_type l = *(my_type *) left, r = *(my_type *) right;
897     return (l < r) ? -1 : (l > r);
898   }
899 .ve
900   Note the context is unused here but you may use it to pass and subsequently access whatever information required
901   inside the comparison function. The context pointer will unaltered except for any changes made inside the comparison function.
902   Then pass the function
903 .vb
904   PetscTimSort(n, arr, sizeof(arr[0]), my_increasing_comparison_function, ctx)
905 .ve
906 
907   Fortran Notes:
908   To use this from fortran you must write a comparison subroutine with 4 arguments which accepts left, right, ctx and
909   returns result. For example
910 .vb
911  subroutine CompareIntegers(left,right,ctx,result)
912    implicit none
913 
914    PetscInt,intent(in) :: left, right
915    type(UserCtx)       :: ctx
916    integer,intent(out) :: result
917 
918    if (left < right) then
919      result = -1
920    else if (left == right) then
921      result = 0
922    else
923      result = 1
924    end if
925    return
926  end subroutine CompareIntegers
927 .ve
928 
929   References:
930 . * - Tim Peters. https://bugs.python.org/file4451/timsort.txt
931 
932   Level: developer
933 
934 .seealso: `PetscTimSortWithArray()`, `PetscIntSortSemiOrdered()`, `PetscRealSortSemiOrdered()`, `PetscMPIIntSortSemiOrdered()`
935 @*/
936 PetscErrorCode PetscTimSort(PetscInt n, void *arr, size_t size, int (*cmp)(const void *, const void *, void *), void *ctx) {
937   PetscInt           stacksize = 0, minrun, runstart = 0, runend = 0;
938   PetscTimSortStack  runstack[128];
939   PetscTimSortBuffer buff;
940   /* stacksize  = log_phi(n) = log_2(n)/log_2(phi), so 128 is enough for ~5.614e26 elements.
941    It is so unlikely that this limit is reached that this is __never__ checked for */
942 
943   PetscFunctionBegin;
944   /* Compute minrun. Minrun should be (32, 65) such that N/minrun
945    is a power of 2 or one plus a power of 2 */
946   {
947     PetscInt t = n, r = 0;
948     /* r becomes 1 if the least significant bits contain at least one off bit */
949     while (t >= 64) {
950       r |= t & 1;
951       t >>= 1;
952     }
953     minrun = t + r;
954   }
955   if (PetscDefined(USE_DEBUG)) {
956     PetscCall(PetscInfo(NULL, "minrun = %" PetscInt_FMT "\n", minrun));
957     if (n < 64) {
958       PetscCall(PetscInfo(NULL, "n %" PetscInt_FMT " < 64, consider using PetscSortInt() instead\n", n));
959     } else PetscCheck(!(minrun < 32) && !(minrun > 65), PETSC_COMM_SELF, PETSC_ERR_PLIB, "Calculated minrun %" PetscInt_FMT " not in range (32,65)", minrun);
960   }
961   PetscCall(PetscMalloc1((size_t)minrun * size, &buff.ptr));
962   buff.size         = (size_t)minrun * size;
963   buff.maxsize      = (size_t)n * size;
964   MIN_GALLOP_GLOBAL = MIN_GALLOP_CONST_GLOBAL;
965   while (runstart < n) {
966     /* Check if additional entries are at least partially ordered and build natural run */
967     PetscCall(PetscTimSortBuildRun_Private((char *)arr, buff.ptr, size, cmp, ctx, n, minrun, runstart, &runend));
968     runstack[stacksize].start = runstart;
969     runstack[stacksize].size  = runend - runstart + 1;
970     PetscCall(PetscTimSortMergeCollapse_Private((char *)arr, size, cmp, ctx, &buff, runstack, &stacksize));
971     ++stacksize;
972     runstart = runend + 1;
973   }
974   /* Have been inside while, so discard last stacksize++ */
975   --stacksize;
976   PetscCall(PetscTimSortForceCollapse_Private((char *)arr, size, cmp, ctx, &buff, runstack, stacksize));
977   PetscCall(PetscFree(buff.ptr));
978   MIN_GALLOP_GLOBAL = MIN_GALLOP_CONST_GLOBAL;
979   PetscFunctionReturn(0);
980 }
981 
982 /*@C
983   PetscTimSortWithArray - Sorts an array in place in increasing order using Tim Peters adaptive sorting algorithm and
984   reorders a second array to match the first. The arrays need not be the same type.
985 
986   Not Collective
987 
988   Input Parameters:
989 + n     - number of values
990 . asize - size in bytes of the datatype held in arr
991 . bsize - size in bytes of the datatype held in barr
992 . cmp   - function pointer to comparison function
993 - ctx   - optional context to be passed to comparison function, NULL if not needed
994 
995   Input/Output Parameters:
996 + arr  - array to be sorted, on output it is sorted
997 - barr - array to be reordered, on output it is reordered
998 
999   Notes:
1000   The arrays need not be of the same type, however barr MUST contain at least as many elements as arr and the two CANNOT
1001   overlap.
1002 
1003   Timsort makes the assumption that input data is already likely partially ordered, or that it contains contiguous
1004   sections (termed 'runs') where the data is locally ordered (but not necessarily globally ordered). It therefore aims
1005  to select slices of the array in such a way that resulting mergesorts operate on near perfectly length-balanced
1006  arrays. To do so it repeatedly triggers attempts throughout to merge adjacent runs.
1007 
1008   Should one run continuously "win" a comparison the algorithm begins the "gallop" phase. It will aggressively search
1009   the "winner" for the location of the "losers" next entry (and vice versa) to copy all preceding elements into place in
1010   bulk. However if the data is truly unordered (as is the case with random data) the immense gains possible from these
1011   searches are expected __not__ to repay their costs. While adjacent arrays are almost all nearly the same size, they
1012   likely all contain similar data.
1013 
1014   Sample usage:
1015   The comparison function must follow the qsort() comparison function paradigm, returning the sign of the difference
1016   between its arguments. If left < right : return -1, if left == right : return 0, if left > right : return 1. The user
1017   may also change or reverse the order of the sort by flipping the above. Note that stability of the sort is only
1018   guaranteed if the comparison function forms a valid trigraph. For example when sorting an array of type "my_type" in
1019   increasing order
1020 
1021 .vb
1022   int my_increasing_comparison_function(const void *left, const void *right, void *ctx) {
1023     my_type l = *(my_type *) left, r = *(my_type *) right;
1024     return (l < r) ? -1 : (l > r);
1025   }
1026 .ve
1027   Note the context is unused here but you may use it to pass and subsequently access whatever information required
1028   inside the comparison function. The context pointer will unaltered except for any changes made inside the comparison function.
1029   Then pass the function
1030 .vb
1031   PetscTimSortWithArray(n, arr, sizeof(arr[0]), barr, sizeof(barr[0]), my_increasing_comparison_function, ctx)
1032 .ve
1033 
1034   Fortran Notes:
1035   To use this from fortran you must write a comparison subroutine with 4 arguments which accepts left, right, ctx and
1036   returns result. For example
1037 .vb
1038  subroutine CompareIntegers(left,right,ctx,result)
1039    implicit none
1040 
1041    PetscInt,intent(in) :: left, right
1042    type(UserCtx)       :: ctx
1043    integer,intent(out) :: result
1044 
1045    if (left < right) then
1046      result = -1
1047    else if (left == right) then
1048      result = 0
1049    else
1050      result = 1
1051    end if
1052    return
1053  end subroutine CompareIntegers
1054 .ve
1055 
1056   References:
1057 . * - Tim Peters. https://bugs.python.org/file4451/timsort.txt
1058 
1059   Level: developer
1060 
1061 .seealso: `PetscTimSort()`, `PetscIntSortSemiOrderedWithArray()`, `PetscRealSortSemiOrderedWithArrayInt()`, `PetscMPIIntSortSemiOrderedWithArray()`
1062 @*/
1063 PetscErrorCode PetscTimSortWithArray(PetscInt n, void *arr, size_t asize, void *barr, size_t bsize, int (*cmp)(const void *, const void *, void *), void *ctx) {
1064   PetscInt           stacksize = 0, minrun, runstart = 0, runend = 0;
1065   PetscTimSortStack  runstack[128];
1066   PetscTimSortBuffer abuff, bbuff;
1067   /* stacksize  = log_phi(n) = log_2(n)/log_2(phi), so 128 is enough for ~5.614e26 elements.
1068    It is so unlikely that this limit is reached that this is __never__ checked for */
1069 
1070   PetscFunctionBegin;
1071   /* Compute minrun. Minrun should be (32, 65) such that N/minrun
1072    is a power of 2 or one plus a power of 2 */
1073   {
1074     PetscInt t = n, r = 0;
1075     /* r becomes 1 if the least significant bits contain at least one off bit */
1076     while (t >= 64) {
1077       r |= t & 1;
1078       t >>= 1;
1079     }
1080     minrun = t + r;
1081   }
1082   if (PetscDefined(USE_DEBUG)) {
1083     PetscCall(PetscInfo(NULL, "minrun = %" PetscInt_FMT "\n", minrun));
1084     PetscCheck(n < 64 || (minrun >= 32 && minrun <= 65), PETSC_COMM_SELF, PETSC_ERR_PLIB, "Calculated minrun %" PetscInt_FMT " not in range (32,65)", minrun);
1085   }
1086   PetscCall(PetscMalloc1((size_t)minrun * asize, &abuff.ptr));
1087   abuff.size    = (size_t)minrun * asize;
1088   abuff.maxsize = (size_t)n * asize;
1089   PetscCall(PetscMalloc1((size_t)minrun * bsize, &bbuff.ptr));
1090   bbuff.size        = (size_t)minrun * bsize;
1091   bbuff.maxsize     = (size_t)n * bsize;
1092   MIN_GALLOP_GLOBAL = MIN_GALLOP_CONST_GLOBAL;
1093   while (runstart < n) {
1094     /* Check if additional entries are at least partially ordered and build natural run */
1095     PetscCall(PetscTimSortBuildRunWithArray_Private((char *)arr, abuff.ptr, asize, (char *)barr, bbuff.ptr, bsize, cmp, ctx, n, minrun, runstart, &runend));
1096     runstack[stacksize].start = runstart;
1097     runstack[stacksize].size  = runend - runstart + 1;
1098     PetscCall(PetscTimSortMergeCollapseWithArray_Private((char *)arr, asize, (char *)barr, bsize, cmp, ctx, &abuff, &bbuff, runstack, &stacksize));
1099     ++stacksize;
1100     runstart = runend + 1;
1101   }
1102   /* Have been inside while, so discard last stacksize++ */
1103   --stacksize;
1104   PetscCall(PetscTimSortForceCollapseWithArray_Private((char *)arr, asize, (char *)barr, bsize, cmp, ctx, &abuff, &bbuff, runstack, stacksize));
1105   PetscCall(PetscFree(abuff.ptr));
1106   PetscCall(PetscFree(bbuff.ptr));
1107   MIN_GALLOP_GLOBAL = MIN_GALLOP_CONST_GLOBAL;
1108   PetscFunctionReturn(0);
1109 }
1110 
1111 /*@
1112    PetscIntSortSemiOrdered - Sorts an array of integers in place in increasing order.
1113 
1114    Not Collective
1115 
1116    Input Parameters:
1117 +  n   - number of values
1118 -  arr - array of integers
1119 
1120    Output Parameters:
1121 .  arr - sorted array of integers
1122 
1123    Notes:
1124    If the array is less than 64 entries long PetscSortInt() is automatically used.
1125 
1126    This function serves as an alternative to PetscSortInt(). While this function works for any array of integers it is
1127    significantly faster if the array is not totally random. There are exceptions to this and so it is __highly__
1128    recommended that the user benchmark their code to see which routine is fastest.
1129 
1130    Level: intermediate
1131 
1132 .seealso: `PetscTimSort()`, `PetscSortInt()`, `PetscSortIntWithPermutation()`
1133 @*/
1134 PetscErrorCode PetscIntSortSemiOrdered(PetscInt n, PetscInt arr[]) {
1135   PetscFunctionBegin;
1136   if (n <= 1) PetscFunctionReturn(0);
1137   PetscValidIntPointer(arr, 2);
1138   if (n < 64) {
1139     PetscCall(PetscSortInt(n, arr));
1140   } else {
1141     PetscCall(PetscTimSort(n, arr, sizeof(PetscInt), Compare_PetscInt_Private, NULL));
1142   }
1143   PetscFunctionReturn(0);
1144 }
1145 
1146 /*@
1147    PetscIntSortSemiOrderedWithArray - Sorts an array of integers in place in increasing order and reorders a second
1148    array to match the first.
1149 
1150    Not Collective
1151 
1152    Input Parameter:
1153 .  n   - number of values
1154 
1155    Input/Output Parameters:
1156 +  arr1 - array of integers to be sorted, modified on output
1157 -  arr2 - array of integers to be reordered, modified on output
1158 
1159    Notes:
1160    The arrays CANNOT overlap.
1161 
1162    This function serves as an alternative to PetscSortIntWithArray(). While this function works for any array of integers it is
1163    significantly faster if the array is not totally random. There are exceptions to this and so it is __highly__
1164    recommended that the user benchmark their code to see which routine is fastest.
1165 
1166    Level: intermediate
1167 
1168 .seealso: `PetscTimSortWithArray()`, `PetscSortIntWithArray()`, `PetscSortIntWithPermutation()`
1169 @*/
1170 PetscErrorCode PetscIntSortSemiOrderedWithArray(PetscInt n, PetscInt arr1[], PetscInt arr2[]) {
1171   PetscFunctionBegin;
1172   if (n <= 1) PetscFunctionReturn(0);
1173   PetscValidIntPointer(arr1, 2);
1174   PetscValidIntPointer(arr2, 3);
1175   /* cannot export out to PetscIntSortWithArray here since it isn't stable */
1176   PetscCall(PetscTimSortWithArray(n, arr1, sizeof(PetscInt), arr2, sizeof(PetscInt), Compare_PetscInt_Private, NULL));
1177   PetscFunctionReturn(0);
1178 }
1179 
1180 /*@
1181    PetscMPIIntSortSemiOrdered - Sorts an array of PetscMPIInts in place in increasing order.
1182 
1183    Not Collective
1184 
1185    Input Parameters:
1186 +  n   - number of values
1187 -  arr - array of PetscMPIInts
1188 
1189    Output Parameters:
1190 .  arr - sorted array of integers
1191 
1192    Notes:
1193    If the array is less than 64 entries long PetscSortMPIInt() is automatically used.
1194 
1195    This function serves as an alternative to PetscSortMPIInt(). While this function works for any array of PetscMPIInts it is
1196    significantly faster if the array is not totally random. There are exceptions to this and so it is __highly__
1197    recommended that the user benchmark their code to see which routine is fastest.
1198 
1199    Level: intermediate
1200 
1201 .seealso: `PetscTimSort()`, `PetscSortMPIInt()`
1202 @*/
1203 PetscErrorCode PetscMPIIntSortSemiOrdered(PetscInt n, PetscMPIInt arr[]) {
1204   PetscFunctionBegin;
1205   if (n <= 1) PetscFunctionReturn(0);
1206   PetscValidIntPointer(arr, 2);
1207   if (n < 64) {
1208     PetscCall(PetscSortMPIInt(n, arr));
1209   } else {
1210     PetscCall(PetscTimSort(n, arr, sizeof(PetscMPIInt), Compare_PetscMPIInt_Private, NULL));
1211   }
1212   PetscFunctionReturn(0);
1213 }
1214 
1215 /*@
1216    PetscMPIIntSortSemiOrderedWithArray - Sorts an array of integers in place in increasing order and reorders a second
1217    array to match the first.
1218 
1219    Not Collective
1220 
1221    Input Parameter:
1222 .  n   - number of values
1223 
1224    Input/Output Parameters:
1225 .  arr1 - array of integers to be sorted, modified on output
1226 -  arr2 - array of integers to be reordered, modified on output
1227 
1228    Notes:
1229    The arrays CANNOT overlap.
1230 
1231    This function serves as an alternative to PetscSortMPIIntWithArray(). While this function works for any array of integers it is
1232    significantly faster if the array is not totally random. There are exceptions to this and so it is __highly__
1233    recommended that the user benchmark their code to see which routine is fastest.
1234 
1235    Level: intermediate
1236 
1237 .seealso: `PetscTimSortWithArray()`, `PetscSortMPIIntWithArray()`, `PetscSortMPIIntWithPermutation()`
1238 @*/
1239 PetscErrorCode PetscMPIIntSortSemiOrderedWithArray(PetscInt n, PetscMPIInt arr1[], PetscMPIInt arr2[]) {
1240   PetscFunctionBegin;
1241   if (n <= 1) PetscFunctionReturn(0);
1242   PetscValidIntPointer(arr1, 2);
1243   PetscValidIntPointer(arr2, 3);
1244   /* cannot export out to PetscMPIIntSortWithArray here since it isn't stable */
1245   PetscCall(PetscTimSortWithArray(n, arr1, sizeof(PetscMPIInt), arr2, sizeof(PetscMPIInt), Compare_PetscMPIInt_Private, NULL));
1246   PetscFunctionReturn(0);
1247 }
1248 
1249 /*@
1250    PetscRealSortSemiOrdered - Sorts an array of PetscReals in place in increasing order.
1251 
1252    Not Collective
1253 
1254    Input Parameters:
1255 +  n   - number of values
1256 -  arr - array of PetscReals
1257 
1258    Output Parameters:
1259 .  arr - sorted array of integers
1260 
1261    Notes:
1262    If the array is less than 64 entries long PetscSortReal() is automatically used.
1263 
1264    This function serves as an alternative to PetscSortReal(). While this function works for any array of PetscReals it is
1265    significantly faster if the array is not totally random. There are exceptions to this and so it is __highly__
1266    recommended that the user benchmark their code to see which routine is fastest.
1267 
1268    Level: intermediate
1269 
1270 .seealso: `PetscTimSort()`, `PetscSortReal()`, `PetscSortRealWithPermutation()`
1271 @*/
1272 PetscErrorCode PetscRealSortSemiOrdered(PetscInt n, PetscReal arr[]) {
1273   PetscFunctionBegin;
1274   if (n <= 1) PetscFunctionReturn(0);
1275   PetscValidRealPointer(arr, 2);
1276   if (n < 64) {
1277     PetscCall(PetscSortReal(n, arr));
1278   } else {
1279     PetscCall(PetscTimSort(n, arr, sizeof(PetscReal), Compare_PetscReal_Private, NULL));
1280   }
1281   PetscFunctionReturn(0);
1282 }
1283 
1284 /*@
1285    PetscRealSortSemiOrderedWithArrayInt - Sorts an array of PetscReals in place in increasing order and reorders a second
1286    array of PetscInts to match the first.
1287 
1288    Not Collective
1289 
1290    Input Parameter:
1291 .  n   - number of values
1292 
1293    Input/Output Parameters:
1294 .  arr1 - array of PetscReals to be sorted, modified on output
1295 -  arr2 - array of PetscReals to be reordered, modified on output
1296 
1297    Notes:
1298    This function serves as an alternative to PetscSortRealWithArray(). While this function works for any array of PetscReals it is
1299    significantly faster if the array is not totally random. There are exceptions to this and so it is __highly__
1300    recommended that the user benchmark their code to see which routine is fastest.
1301 
1302    Level: intermediate
1303 
1304 .seealso: `PetscTimSortWithArray()`, `PetscSortRealWithArrayInt()`, `PetscSortRealWithPermutation()`
1305 @*/
1306 PetscErrorCode PetscRealSortSemiOrderedWithArrayInt(PetscInt n, PetscReal arr1[], PetscInt arr2[]) {
1307   PetscFunctionBegin;
1308   if (n <= 1) PetscFunctionReturn(0);
1309   PetscValidRealPointer(arr1, 2);
1310   PetscValidIntPointer(arr2, 3);
1311   /* cannot export out to PetscRealSortWithArrayInt here since it isn't stable */
1312   PetscCall(PetscTimSortWithArray(n, arr1, sizeof(PetscReal), arr2, sizeof(PetscInt), Compare_PetscReal_Private, NULL));
1313   PetscFunctionReturn(0);
1314 }
1315