xref: /petsc/src/sys/utils/sortso.c (revision 697336901c45ac77e1fd620fe1fca906cf3f95c8)
1 #include <petscsys.h> /*I  "petscsys.h"  I*/
2 #include <petsc/private/petscimpl.h>
3 
4 static inline int Compare_PetscMPIInt_Private(const void *left, const void *right, PETSC_UNUSED void *ctx)
5 {
6   PetscMPIInt l = *(PetscMPIInt *)left, r = *(PetscMPIInt *)right;
7   return (l < r) ? -1 : (l > r);
8 }
9 
10 static inline int Compare_PetscInt_Private(const void *left, const void *right, PETSC_UNUSED void *ctx)
11 {
12   PetscInt l = *(PetscInt *)left, r = *(PetscInt *)right;
13   return (l < r) ? -1 : (l > r);
14 }
15 
16 static inline int Compare_PetscReal_Private(const void *left, const void *right, PETSC_UNUSED void *ctx)
17 {
18   PetscReal l = *(PetscReal *)left, r = *(PetscReal *)right;
19   return (l < r) ? -1 : (l > r);
20 }
21 
22 #define MIN_GALLOP_CONST_GLOBAL 8
23 static PetscInt MIN_GALLOP_GLOBAL = MIN_GALLOP_CONST_GLOBAL;
24 typedef int (*CompFunc)(const void *, const void *, void *);
25 
26 /* Mostly to force clang uses the builtins, but can't hurt to have gcc compilers also do the same */
27 #if !defined(PETSC_USE_DEBUG) && defined(__GNUC__)
28 static inline void COPYSWAPPY(char *a, char *b, char *t, size_t size)
29 {
30   __builtin_memcpy(t, b, size);
31   __builtin_memmove(b, a, size);
32   __builtin_memcpy(a, t, size);
33   return;
34 }
35 
36 static inline void COPYSWAPPY2(char *al, char *ar, size_t asize, char *bl, char *br, size_t bsize, char *t)
37 {
38   __builtin_memcpy(t, ar, asize);
39   __builtin_memmove(ar, al, asize);
40   __builtin_memcpy(al, t, asize);
41   __builtin_memcpy(t, br, bsize);
42   __builtin_memmove(br, bl, bsize);
43   __builtin_memcpy(bl, t, bsize);
44   return;
45 }
46 
47 static inline void Petsc_memcpy(char *dest, const char *src, size_t size)
48 {
49   __builtin_memcpy(dest, src, size);
50   return;
51 }
52 
53 static inline void Petsc_memcpy2(char *adest, const char *asrc, size_t asize, char *bdest, const char *bsrc, size_t bsize)
54 {
55   __builtin_memcpy(adest, asrc, asize);
56   __builtin_memcpy(bdest, bsrc, bsize);
57   return;
58 }
59 
60 static inline void Petsc_memmove(char *dest, const char *src, size_t size)
61 {
62   __builtin_memmove(dest, src, size);
63   return;
64 }
65 
66 static inline void Petsc_memmove2(char *adest, const char *asrc, size_t asize, char *bdest, const char *bsrc, size_t bsize)
67 {
68   __builtin_memmove(adest, asrc, asize);
69   __builtin_memmove(bdest, bsrc, bsize);
70   return;
71 }
72 #else
73 static inline void COPYSWAPPY(char *a, char *b, char *t, size_t size)
74 {
75   PetscFunctionBegin;
76   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(t, b, size));
77   PetscCallAbort(PETSC_COMM_SELF, PetscMemmove(b, a, size));
78   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(a, t, size));
79   PetscFunctionReturnVoid();
80 }
81 
82 static inline void COPYSWAPPY2(char *al, char *ar, size_t asize, char *bl, char *br, size_t bsize, char *t)
83 {
84   PetscFunctionBegin;
85   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(t, ar, asize));
86   PetscCallAbort(PETSC_COMM_SELF, PetscMemmove(ar, al, asize));
87   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(al, t, asize));
88   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(t, br, bsize));
89   PetscCallAbort(PETSC_COMM_SELF, PetscMemmove(br, bl, bsize));
90   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(bl, t, bsize));
91   PetscFunctionReturnVoid();
92 }
93 
94 static inline void Petsc_memcpy(char *dest, const char *src, size_t size)
95 {
96   PetscFunctionBegin;
97   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(dest, src, size));
98   PetscFunctionReturnVoid();
99 }
100 
101 static inline void Petsc_memcpy2(char *adest, const char *asrc, size_t asize, char *bdest, const char *bsrc, size_t bsize)
102 {
103   PetscFunctionBegin;
104   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(adest, asrc, asize));
105   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(bdest, bsrc, bsize));
106   PetscFunctionReturnVoid();
107 }
108 
109 static inline void Petsc_memmove(char *dest, const char *src, size_t size)
110 {
111   PetscFunctionBegin;
112   PetscCallAbort(PETSC_COMM_SELF, PetscMemmove(dest, src, size));
113   PetscFunctionReturnVoid();
114 }
115 
116 static inline void Petsc_memmove2(char *adest, const char *asrc, size_t asize, char *bdest, const char *bsrc, size_t bsize)
117 {
118   PetscFunctionBegin;
119   PetscCallAbort(PETSC_COMM_SELF, PetscMemmove(adest, asrc, asize));
120   PetscCallAbort(PETSC_COMM_SELF, PetscMemmove(bdest, bsrc, bsize));
121   PetscFunctionReturnVoid();
122 }
123 #endif
124 
125 /* Start left look right. Looking for e.g. B[0] in A or mergelo. l inclusive, r inclusive. Returns first m such that arr[m] >
126  x. Output also inclusive.
127 
128  NOTE: Both gallopsearch functions CANNOT distinguish between inserting AFTER the entire array vs inserting at entry n!! For example for an array:
129 
130    {0,1,2,3,4,5}
131 
132    when looking to insert "5" this routine will return *m = 6, but when looking to insert "6" it will ALSO return *m = 6.
133   */
134 static inline PetscErrorCode PetscGallopSearchLeft_Private(const char *arr, size_t size, CompFunc cmp, void *ctx, PetscInt l, PetscInt r, const char *x, PetscInt *m)
135 {
136   PetscInt last = l, k = 1, mid, cur = l + 1;
137 
138   PetscFunctionBegin;
139   *m = l;
140   PetscAssert(r >= l, PETSC_COMM_SELF, PETSC_ERR_PLIB, "r %" PetscInt_FMT " < l %" PetscInt_FMT " in PetscGallopSearchLeft", r, l);
141   if ((*cmp)(x, arr + r * size, ctx) >= 0) {
142     *m = r;
143     PetscFunctionReturn(PETSC_SUCCESS);
144   }
145   if ((*cmp)(x, (arr) + l * size, ctx) < 0 || PetscUnlikely(!(r - l))) PetscFunctionReturn(PETSC_SUCCESS);
146   while (PETSC_TRUE) {
147     if (PetscUnlikely(cur > r)) {
148       cur = r;
149       break;
150     }
151     if ((*cmp)(x, (arr) + cur * size, ctx) < 0) break;
152     last = cur;
153     cur += (k <<= 1) + 1;
154     ++k;
155   }
156   /* standard binary search but take last 0 mid 0 cur 1 into account*/
157   while (cur > last + 1) {
158     mid = last + ((cur - last) >> 1);
159     if ((*cmp)(x, (arr) + mid * size, ctx) < 0) {
160       cur = mid;
161     } else {
162       last = mid;
163     }
164   }
165   *m = cur;
166   PetscFunctionReturn(PETSC_SUCCESS);
167 }
168 
169 /* Start right look left. Looking for e.g. A[-1] in B or mergehi. l inclusive, r inclusive. Returns last m such that arr[m]
170  < x. Output also inclusive */
171 static inline PetscErrorCode PetscGallopSearchRight_Private(const char *arr, size_t size, CompFunc cmp, void *ctx, PetscInt l, PetscInt r, const char *x, PetscInt *m)
172 {
173   PetscInt last = r, k = 1, mid, cur = r - 1;
174 
175   PetscFunctionBegin;
176   *m = r;
177   PetscAssert(r >= l, PETSC_COMM_SELF, PETSC_ERR_PLIB, "r %" PetscInt_FMT " < l %" PetscInt_FMT " in PetscGallopSearchRight", r, l);
178   if ((*cmp)(x, arr + l * size, ctx) <= 0) {
179     *m = l;
180     PetscFunctionReturn(PETSC_SUCCESS);
181   }
182   if ((*cmp)(x, (arr) + r * size, ctx) > 0 || PetscUnlikely(!(r - l))) PetscFunctionReturn(PETSC_SUCCESS);
183   while (PETSC_TRUE) {
184     if (PetscUnlikely(cur < l)) {
185       cur = l;
186       break;
187     }
188     if ((*cmp)(x, (arr) + cur * size, ctx) > 0) break;
189     last = cur;
190     cur -= (k <<= 1) + 1;
191     ++k;
192   }
193   /* standard binary search but take last r-1 mid r-1 cur r-2 into account*/
194   while (last > cur + 1) {
195     mid = last - ((last - cur) >> 1);
196     if ((*cmp)(x, (arr) + mid * size, ctx) > 0) {
197       cur = mid;
198     } else {
199       last = mid;
200     }
201   }
202   *m = cur;
203   PetscFunctionReturn(PETSC_SUCCESS);
204 }
205 
206 /* Mergesort where size of left half <= size of right half, so mergesort is done left to right. Arr should be pointer to
207  complete array, left is first index of left array, mid is first index of right array, right is last index of right
208  array */
209 static inline PetscErrorCode PetscTimSortMergeLo_Private(char *arr, char *tarr, size_t size, CompFunc cmp, void *ctx, PetscInt left, PetscInt mid, PetscInt right)
210 {
211   PetscInt       i = 0, j = mid, k = left, gallopleft = 0, gallopright = 0;
212   const PetscInt llen = mid - left;
213 
214   PetscFunctionBegin;
215   Petsc_memcpy(tarr, arr + (left * size), llen * size);
216   while ((i < llen) && (j <= right)) {
217     if ((*cmp)(tarr + (i * size), arr + (j * size), ctx) < 0) {
218       Petsc_memcpy(arr + (k * size), tarr + (i * size), size);
219       ++k;
220       gallopright = 0;
221       if (++i < llen && ++gallopleft >= MIN_GALLOP_GLOBAL) {
222         PetscInt l1, l2, diff1, diff2;
223         do {
224           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
225           /* search temp for right[j], can move up to that of temp into arr immediately */
226           PetscCall(PetscGallopSearchLeft_Private(tarr, size, cmp, ctx, i, llen - 1, arr + (j * size), &l1));
227           diff1 = l1 - i;
228           Petsc_memcpy(arr + (k * size), tarr + (i * size), diff1 * size);
229           k += diff1;
230           i = l1;
231           /* search right for temp[i], can move up to that many of right into arr */
232           PetscCall(PetscGallopSearchLeft_Private(arr, size, cmp, ctx, j, right, tarr + (i * size), &l2));
233           diff2 = l2 - j;
234           Petsc_memmove((arr) + k * size, (arr) + j * size, diff2 * size);
235           k += diff2;
236           j = l2;
237           if (i >= llen || j > right) break;
238         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
239         ++MIN_GALLOP_GLOBAL;
240       }
241     } else {
242       Petsc_memmove(arr + (k * size), arr + (j * size), size);
243       ++k;
244       gallopleft = 0;
245       if (++j <= right && ++gallopright >= MIN_GALLOP_GLOBAL) {
246         PetscInt l1, l2, diff1, diff2;
247         do {
248           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
249           /* search right for temp[i], can move up to that many of right into arr */
250           PetscCall(PetscGallopSearchLeft_Private(arr, size, cmp, ctx, j, right, tarr + (i * size), &l2));
251           diff2 = l2 - j;
252           Petsc_memmove(arr + (k * size), arr + (j * size), diff2 * size);
253           k += diff2;
254           j = l2;
255           /* search temp for right[j], can copy up to that of temp into arr immediately */
256           PetscCall(PetscGallopSearchLeft_Private(tarr, size, cmp, ctx, i, llen - 1, arr + (j * size), &l1));
257           diff1 = l1 - i;
258           Petsc_memcpy(arr + (k * size), tarr + (i * size), diff1 * size);
259           k += diff1;
260           i = l1;
261           if (i >= llen || j > right) break;
262         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
263         ++MIN_GALLOP_GLOBAL;
264       }
265     }
266   }
267   if (i < llen) Petsc_memcpy(arr + (k * size), tarr + (i * size), (llen - i) * size);
268   PetscFunctionReturn(PETSC_SUCCESS);
269 }
270 
271 static inline PetscErrorCode PetscTimSortMergeLoWithArray_Private(char *arr, char *atarr, size_t asize, char *barr, char *btarr, size_t bsize, CompFunc cmp, void *ctx, PetscInt left, PetscInt mid, PetscInt right)
272 {
273   PetscInt       i = 0, j = mid, k = left, gallopleft = 0, gallopright = 0;
274   const PetscInt llen = mid - left;
275 
276   PetscFunctionBegin;
277   Petsc_memcpy2(atarr, arr + (left * asize), llen * asize, btarr, barr + (left * bsize), llen * bsize);
278   while ((i < llen) && (j <= right)) {
279     if ((*cmp)(atarr + (i * asize), arr + (j * asize), ctx) < 0) {
280       Petsc_memcpy2(arr + (k * asize), atarr + (i * asize), asize, barr + (k * bsize), btarr + (i * bsize), bsize);
281       ++k;
282       gallopright = 0;
283       if (++i < llen && ++gallopleft >= MIN_GALLOP_GLOBAL) {
284         PetscInt l1, l2, diff1, diff2;
285         do {
286           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
287           /* search temp for right[j], can move up to that of temp into arr immediately */
288           PetscCall(PetscGallopSearchLeft_Private(atarr, asize, cmp, ctx, i, llen - 1, arr + (j * asize), &l1));
289           diff1 = l1 - i;
290           Petsc_memcpy2(arr + (k * asize), atarr + (i * asize), diff1 * asize, barr + (k * bsize), btarr + (i * bsize), diff1 * bsize);
291           k += diff1;
292           i = l1;
293           /* search right for temp[i], can move up to that many of right into arr */
294           PetscCall(PetscGallopSearchLeft_Private(arr, asize, cmp, ctx, j, right, atarr + (i * asize), &l2));
295           diff2 = l2 - j;
296           Petsc_memmove2(arr + (k * asize), arr + (j * asize), diff2 * asize, barr + (k * bsize), barr + (j * bsize), diff2 * bsize);
297           k += diff2;
298           j = l2;
299           if (i >= llen || j > right) break;
300         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
301         ++MIN_GALLOP_GLOBAL;
302       }
303     } else {
304       Petsc_memmove2(arr + (k * asize), arr + (j * asize), asize, barr + (k * bsize), barr + (j * bsize), bsize);
305       ++k;
306       gallopleft = 0;
307       if (++j <= right && ++gallopright >= MIN_GALLOP_GLOBAL) {
308         PetscInt l1, l2, diff1, diff2;
309         do {
310           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
311           /* search right for temp[i], can move up to that many of right into arr */
312           PetscCall(PetscGallopSearchLeft_Private(arr, asize, cmp, ctx, j, right, atarr + (i * asize), &l2));
313           diff2 = l2 - j;
314           Petsc_memmove2(arr + (k * asize), arr + (j * asize), diff2 * asize, barr + (k * bsize), barr + (j * bsize), diff2 * bsize);
315           k += diff2;
316           j = l2;
317           /* search temp for right[j], can copy up to that of temp into arr immediately */
318           PetscCall(PetscGallopSearchLeft_Private(atarr, asize, cmp, ctx, i, llen - 1, arr + (j * asize), &l1));
319           diff1 = l1 - i;
320           Petsc_memcpy2(arr + (k * asize), atarr + (i * asize), diff1 * asize, barr + (k * bsize), btarr + (i * bsize), diff1 * bsize);
321           k += diff1;
322           i = l1;
323           if (i >= llen || j > right) break;
324         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
325         ++MIN_GALLOP_GLOBAL;
326       }
327     }
328   }
329   if (i < llen) Petsc_memcpy2(arr + (k * asize), atarr + (i * asize), (llen - i) * asize, barr + (k * bsize), btarr + (i * bsize), (llen - i) * bsize);
330   PetscFunctionReturn(PETSC_SUCCESS);
331 }
332 
333 /* Mergesort where size of right half < size of left half, so mergesort is done right to left. Arr should be pointer to
334  complete array, left is first index of left array, mid is first index of right array, right is last index of right
335  array */
336 static inline PetscErrorCode PetscTimSortMergeHi_Private(char *arr, char *tarr, size_t size, CompFunc cmp, void *ctx, PetscInt left, PetscInt mid, PetscInt right)
337 {
338   PetscInt       i = right - mid, j = mid - 1, k = right, gallopleft = 0, gallopright = 0;
339   const PetscInt rlen = right - mid + 1;
340 
341   PetscFunctionBegin;
342   Petsc_memcpy(tarr, (arr) + mid * size, rlen * size);
343   while ((i >= 0) && (j >= left)) {
344     if ((*cmp)((tarr) + i * size, (arr) + j * size, ctx) > 0) {
345       Petsc_memcpy((arr) + k * size, (tarr) + i * size, size);
346       --k;
347       gallopleft = 0;
348       if (--i >= 0 && ++gallopright >= MIN_GALLOP_GLOBAL) {
349         PetscInt l1, l2, diff1, diff2;
350         do {
351           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
352           /* search temp for left[j], can copy up to that many of temp into arr */
353           PetscCall(PetscGallopSearchRight_Private(tarr, size, cmp, ctx, 0, i, (arr) + j * size, &l1));
354           diff1 = i - l1;
355           Petsc_memcpy((arr) + (k - diff1 + 1) * size, (tarr) + (l1 + 1) * size, diff1 * size);
356           k -= diff1;
357           i = l1;
358           /* search left for temp[i], can move up to that many of left up arr */
359           PetscCall(PetscGallopSearchRight_Private(arr, size, cmp, ctx, left, j, (tarr) + i * size, &l2));
360           diff2 = j - l2;
361           Petsc_memmove((arr) + (k - diff2 + 1) * size, (arr) + (l2 + 1) * size, diff2 * size);
362           k -= diff2;
363           j = l2;
364           if (i < 0 || j < left) break;
365         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
366         ++MIN_GALLOP_GLOBAL;
367       }
368     } else {
369       Petsc_memmove((arr) + k * size, (arr) + j * size, size);
370       --k;
371       gallopright = 0;
372       if (--j >= left && ++gallopleft >= MIN_GALLOP_GLOBAL) {
373         PetscInt l1, l2, diff1, diff2;
374         do {
375           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
376           /* search left for temp[i], can move up to that many of left up arr */
377           PetscCall(PetscGallopSearchRight_Private(arr, size, cmp, ctx, left, j, (tarr) + i * size, &l2));
378           diff2 = j - l2;
379           Petsc_memmove((arr) + (k - diff2 + 1) * size, (arr) + (l2 + 1) * size, diff2 * size);
380           k -= diff2;
381           j = l2;
382           /* search temp for left[j], can copy up to that many of temp into arr */
383           PetscCall(PetscGallopSearchRight_Private(tarr, size, cmp, ctx, 0, i, (arr) + j * size, &l1));
384           diff1 = i - l1;
385           Petsc_memcpy((arr) + (k - diff1 + 1) * size, (tarr) + (l1 + 1) * size, diff1 * size);
386           k -= diff1;
387           i = l1;
388           if (i < 0 || j < left) break;
389         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
390         ++MIN_GALLOP_GLOBAL;
391       }
392     }
393   }
394   if (i >= 0) Petsc_memcpy((arr) + left * size, tarr, (i + 1) * size);
395   PetscFunctionReturn(PETSC_SUCCESS);
396 }
397 
398 static inline PetscErrorCode PetscTimSortMergeHiWithArray_Private(char *arr, char *atarr, size_t asize, char *barr, char *btarr, size_t bsize, CompFunc cmp, void *ctx, PetscInt left, PetscInt mid, PetscInt right)
399 {
400   PetscInt       i = right - mid, j = mid - 1, k = right, gallopleft = 0, gallopright = 0;
401   const PetscInt rlen = right - mid + 1;
402 
403   PetscFunctionBegin;
404   Petsc_memcpy2(atarr, (arr) + mid * asize, rlen * asize, btarr, (barr) + mid * bsize, rlen * bsize);
405   while ((i >= 0) && (j >= left)) {
406     if ((*cmp)((atarr) + i * asize, (arr) + j * asize, ctx) > 0) {
407       Petsc_memcpy2((arr) + k * asize, (atarr) + i * asize, asize, (barr) + k * bsize, (btarr) + i * bsize, bsize);
408       --k;
409       gallopleft = 0;
410       if (--i >= 0 && ++gallopright >= MIN_GALLOP_GLOBAL) {
411         PetscInt l1, l2, diff1, diff2;
412         do {
413           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
414           /* search temp for left[j], can copy up to that many of temp into arr */
415           PetscCall(PetscGallopSearchRight_Private(atarr, asize, cmp, ctx, 0, i, (arr) + j * asize, &l1));
416           diff1 = i - l1;
417           Petsc_memcpy2((arr) + (k - diff1 + 1) * asize, (atarr) + (l1 + 1) * asize, diff1 * asize, (barr) + (k - diff1 + 1) * bsize, (btarr) + (l1 + 1) * bsize, diff1 * bsize);
418           k -= diff1;
419           i = l1;
420           /* search left for temp[i], can move up to that many of left up arr */
421           PetscCall(PetscGallopSearchRight_Private(arr, asize, cmp, ctx, left, j, (atarr) + i * asize, &l2));
422           diff2 = j - l2;
423           Petsc_memmove2((arr) + (k - diff2 + 1) * asize, (arr) + (l2 + 1) * asize, diff2 * asize, (barr) + (k - diff2 + 1) * bsize, (barr) + (l2 + 1) * bsize, diff2 * bsize);
424           k -= diff2;
425           j = l2;
426           if (i < 0 || j < left) break;
427         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
428         ++MIN_GALLOP_GLOBAL;
429       }
430     } else {
431       Petsc_memmove2((arr) + k * asize, (arr) + j * asize, asize, (barr) + k * bsize, (barr) + j * bsize, bsize);
432       --k;
433       gallopright = 0;
434       if (--j >= left && ++gallopleft >= MIN_GALLOP_GLOBAL) {
435         PetscInt l1, l2, diff1, diff2;
436         do {
437           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
438           /* search left for temp[i], can move up to that many of left up arr */
439           PetscCall(PetscGallopSearchRight_Private(arr, asize, cmp, ctx, left, j, (atarr) + i * asize, &l2));
440           diff2 = j - l2;
441           Petsc_memmove2((arr) + (k - diff2 + 1) * asize, (arr) + (l2 + 1) * asize, diff2 * asize, (barr) + (k - diff2 + 1) * bsize, (barr) + (l2 + 1) * bsize, diff2 * bsize);
442           k -= diff2;
443           j = l2;
444           /* search temp for left[j], can copy up to that many of temp into arr */
445           PetscCall(PetscGallopSearchRight_Private(atarr, asize, cmp, ctx, 0, i, (arr) + j * asize, &l1));
446           diff1 = i - l1;
447           Petsc_memcpy2((arr) + (k - diff1 + 1) * asize, (atarr) + (l1 + 1) * asize, diff1 * asize, (barr) + (k - diff1 + 1) * bsize, (btarr) + (l1 + 1) * bsize, diff1 * bsize);
448           k -= diff1;
449           i = l1;
450           if (i < 0 || j < left) break;
451         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
452         ++MIN_GALLOP_GLOBAL;
453       }
454     }
455   }
456   if (i >= 0) Petsc_memcpy2((arr) + left * asize, atarr, (i + 1) * asize, (barr) + left * bsize, btarr, (i + 1) * bsize);
457   PetscFunctionReturn(PETSC_SUCCESS);
458 }
459 
460 /* Left is inclusive lower bound of array slice, start is start location of unsorted section, right is inclusive upper
461  bound of array slice. If unsure of where unsorted section starts or if entire length is unsorted pass start = left */
462 static inline PetscErrorCode PetscInsertionSort_Private(char *arr, char *tarr, size_t size, CompFunc cmp, void *ctx, PetscInt left, PetscInt start, PetscInt right)
463 {
464   PetscInt i = start == left ? start + 1 : start;
465 
466   PetscFunctionBegin;
467   for (; i <= right; ++i) {
468     PetscInt j = i - 1;
469     Petsc_memcpy(tarr, arr + (i * size), size);
470     while ((j >= left) && ((*cmp)(tarr, (arr) + j * size, ctx) < 0)) {
471       COPYSWAPPY(arr + (j + 1) * size, arr + j * size, tarr + size, size);
472       --j;
473     }
474     Petsc_memcpy((arr) + (j + 1) * size, tarr, size);
475   }
476   PetscFunctionReturn(PETSC_SUCCESS);
477 }
478 
479 static inline PetscErrorCode PetscInsertionSortWithArray_Private(char *arr, char *atarr, size_t asize, char *barr, char *btarr, size_t bsize, CompFunc cmp, void *ctx, PetscInt left, PetscInt start, PetscInt right)
480 {
481   PetscInt i = start == left ? start + 1 : start;
482 
483   PetscFunctionBegin;
484   for (; i <= right; ++i) {
485     PetscInt j = i - 1;
486     Petsc_memcpy2(atarr, arr + (i * asize), asize, btarr, barr + (i * bsize), bsize);
487     while ((j >= left) && ((*cmp)(atarr, arr + (j * asize), ctx) < 0)) {
488       COPYSWAPPY2(arr + (j + 1) * asize, arr + j * asize, asize, barr + (j + 1) * bsize, barr + j * bsize, bsize, atarr + asize);
489       --j;
490     }
491     Petsc_memcpy2(arr + (j + 1) * asize, atarr, asize, barr + (j + 1) * bsize, btarr, bsize);
492   }
493   PetscFunctionReturn(PETSC_SUCCESS);
494 }
495 
496 /* See PetscInsertionSort_Private */
497 static inline PetscErrorCode PetscBinaryInsertionSort_Private(char *arr, char *tarr, size_t size, CompFunc cmp, void *ctx, PetscInt left, PetscInt start, PetscInt right)
498 {
499   PetscInt i = start == left ? start + 1 : start;
500 
501   PetscFunctionBegin;
502   for (; i <= right; ++i) {
503     PetscInt l = left, r = i;
504     Petsc_memcpy(tarr, arr + (i * size), size);
505     do {
506       const PetscInt m = l + ((r - l) >> 1);
507       if ((*cmp)(tarr, arr + (m * size), ctx) < 0) {
508         r = m;
509       } else {
510         l = m + 1;
511       }
512     } while (l < r);
513     Petsc_memmove(arr + ((l + 1) * size), arr + (l * size), (i - l) * size);
514     Petsc_memcpy(arr + (l * size), tarr, size);
515   }
516   PetscFunctionReturn(PETSC_SUCCESS);
517 }
518 
519 static inline PetscErrorCode PetscBinaryInsertionSortWithArray_Private(char *arr, char *atarr, size_t asize, char *barr, char *btarr, size_t bsize, CompFunc cmp, void *ctx, PetscInt left, PetscInt start, PetscInt right)
520 {
521   PetscInt i = start == left ? start + 1 : start;
522 
523   PetscFunctionBegin;
524   for (; i <= right; ++i) {
525     PetscInt l = left, r = i;
526     Petsc_memcpy2(atarr, arr + (i * asize), asize, btarr, barr + (i * bsize), bsize);
527     do {
528       const PetscInt m = l + ((r - l) >> 1);
529       if ((*cmp)(atarr, arr + (m * asize), ctx) < 0) {
530         r = m;
531       } else {
532         l = m + 1;
533       }
534     } while (l < r);
535     Petsc_memmove2(arr + ((l + 1) * asize), arr + (l * asize), (i - l) * asize, barr + ((l + 1) * bsize), barr + (l * bsize), (i - l) * bsize);
536     Petsc_memcpy2(arr + (l * asize), atarr, asize, barr + (l * bsize), btarr, bsize);
537   }
538   PetscFunctionReturn(PETSC_SUCCESS);
539 }
540 
541 typedef struct {
542   PetscInt size;
543   PetscInt start;
544 #if defined(__CRAY_AARCH64) /* segfaults with Cray compilers for aarch64 on a64FX */
545 } PetscTimSortStack;
546 #else
547 } PetscTimSortStack PETSC_ATTRIBUTEALIGNED(2 * sizeof(PetscInt));
548 #endif
549 
550 typedef struct {
551   char *ptr PETSC_ATTRIBUTEALIGNED(PETSC_MEMALIGN);
552   size_t    size;
553   size_t    maxsize;
554 } PetscTimSortBuffer;
555 
556 static inline PetscErrorCode PetscTimSortResizeBuffer_Private(PetscTimSortBuffer *buff, size_t newSize)
557 {
558   PetscFunctionBegin;
559   if (PetscLikely(newSize <= buff->size)) PetscFunctionReturn(PETSC_SUCCESS);
560   {
561     /* Can't be larger than n, there is merit to simply allocating buff to n to begin with */
562     size_t newMax = PetscMin(newSize * newSize, buff->maxsize);
563     PetscCall(PetscFree(buff->ptr));
564     PetscCall(PetscMalloc1(newMax, &buff->ptr));
565     buff->size = newMax;
566   }
567   PetscFunctionReturn(PETSC_SUCCESS);
568 }
569 
570 static inline PetscErrorCode PetscTimSortForceCollapse_Private(char *arr, size_t size, CompFunc cmp, void *ctx, PetscTimSortBuffer *buff, PetscTimSortStack *stack, PetscInt stacksize)
571 {
572   PetscFunctionBegin;
573   for (; stacksize; --stacksize) {
574     /* A = stack[i-1], B = stack[i], if A[-1] <= B[0] means sorted */
575     if ((*cmp)(arr + (stack[stacksize].start - 1) * size, arr + (stack[stacksize].start) * size, ctx) > 0) {
576       PetscInt l, m = stack[stacksize].start, r;
577       /* Search A for B[0] insertion */
578       PetscCall(PetscGallopSearchLeft_Private(arr, size, cmp, ctx, stack[stacksize - 1].start, stack[stacksize].start - 1, (arr) + (stack[stacksize].start) * size, &l));
579       /* Search B for A[-1] insertion */
580       PetscCall(PetscGallopSearchRight_Private(arr, size, cmp, ctx, stack[stacksize].start, stack[stacksize].start + stack[stacksize].size - 1, (arr) + (stack[stacksize].start - 1) * size, &r));
581       if (m - l <= r - m) {
582         PetscCall(PetscTimSortResizeBuffer_Private(buff, (m - l + 1) * size));
583         PetscCall(PetscTimSortMergeLo_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
584       } else {
585         PetscCall(PetscTimSortResizeBuffer_Private(buff, (r - m + 1) * size));
586         PetscCall(PetscTimSortMergeHi_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
587       }
588     }
589     /* Update A with merge */
590     stack[stacksize - 1].size += stack[stacksize].size;
591   }
592   PetscFunctionReturn(PETSC_SUCCESS);
593 }
594 
595 static inline PetscErrorCode PetscTimSortForceCollapseWithArray_Private(char *arr, size_t asize, char *barr, size_t bsize, CompFunc cmp, void *ctx, PetscTimSortBuffer *abuff, PetscTimSortBuffer *bbuff, PetscTimSortStack *stack, PetscInt stacksize)
596 {
597   PetscFunctionBegin;
598   for (; stacksize; --stacksize) {
599     /* A = stack[i-1], B = stack[i], if A[-1] <= B[0] means sorted */
600     if ((*cmp)(arr + (stack[stacksize].start - 1) * asize, arr + (stack[stacksize].start) * asize, ctx) > 0) {
601       PetscInt l, m = stack[stacksize].start, r;
602       /* Search A for B[0] insertion */
603       PetscCall(PetscGallopSearchLeft_Private(arr, asize, cmp, ctx, stack[stacksize - 1].start, stack[stacksize].start - 1, (arr) + (stack[stacksize].start) * asize, &l));
604       /* Search B for A[-1] insertion */
605       PetscCall(PetscGallopSearchRight_Private(arr, asize, cmp, ctx, stack[stacksize].start, stack[stacksize].start + stack[stacksize].size - 1, (arr) + (stack[stacksize].start - 1) * asize, &r));
606       if (m - l <= r - m) {
607         PetscCall(PetscTimSortResizeBuffer_Private(abuff, (m - l + 1) * asize));
608         PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (m - l + 1) * bsize));
609         PetscCall(PetscTimSortMergeLoWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
610       } else {
611         PetscCall(PetscTimSortResizeBuffer_Private(abuff, (r - m + 1) * asize));
612         PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (r - m + 1) * bsize));
613         PetscCall(PetscTimSortMergeHiWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
614       }
615     }
616     /* Update A with merge */
617     stack[stacksize - 1].size += stack[stacksize].size;
618   }
619   PetscFunctionReturn(PETSC_SUCCESS);
620 }
621 
622 static inline PetscErrorCode PetscTimSortMergeCollapse_Private(char *arr, size_t size, CompFunc cmp, void *ctx, PetscTimSortBuffer *buff, PetscTimSortStack *stack, PetscInt *stacksize)
623 {
624   PetscInt i = *stacksize;
625 
626   PetscFunctionBegin;
627   while (i) {
628     PetscInt l, m, r, itemp = i;
629 
630     if (i == 1) {
631       /* A = stack[i-1], B = stack[i] */
632       if (stack[i - 1].size < stack[i].size) {
633         /* if A[-1] <= B[0] then sorted */
634         if ((*cmp)(arr + (stack[i].start - 1) * size, arr + (stack[i].start) * size, ctx) > 0) {
635           m = stack[i].start;
636           /* Search A for B[0] insertion */
637           PetscCall(PetscGallopSearchLeft_Private(arr, size, cmp, ctx, stack[i - 1].start, stack[i].start - 1, (arr) + stack[i].start * size, &l));
638           /* Search B for A[-1] insertion */
639           PetscCall(PetscGallopSearchRight_Private(arr, size, cmp, ctx, stack[i].start, stack[i].start + stack[i].size - 1, arr + (stack[i].start - 1) * size, &r));
640           if (m - l <= r - m) {
641             PetscCall(PetscTimSortResizeBuffer_Private(buff, (m - l + 1) * size));
642             PetscCall(PetscTimSortMergeLo_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
643           } else {
644             PetscCall(PetscTimSortResizeBuffer_Private(buff, (r - m + 1) * size));
645             PetscCall(PetscTimSortMergeHi_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
646           }
647         }
648         /* Update A with merge */
649         stack[i - 1].size += stack[i].size;
650         --i;
651       }
652     } else {
653       /* i > 2, i.e. C exists
654        A = stack[i-2], B = stack[i-1], C = stack[i]; */
655       if (stack[i - 2].size <= stack[i - 1].size + stack[i].size) {
656         if (stack[i - 2].size < stack[i].size) {
657           /* merge B into A, but if A[-1] <= B[0] then already sorted */
658           if ((*cmp)(arr + (stack[i - 1].start - 1) * size, arr + (stack[i - 1].start) * size, ctx) > 0) {
659             m = stack[i - 1].start;
660             /* Search A for B[0] insertion */
661             PetscCall(PetscGallopSearchLeft_Private(arr, size, cmp, ctx, stack[i - 2].start, stack[i - 1].start - 1, (arr) + (stack[i - 1].start) * size, &l));
662             /* Search B for A[-1] insertion */
663             PetscCall(PetscGallopSearchRight_Private(arr, size, cmp, ctx, stack[i - 1].start, stack[i - 1].start + stack[i - 1].size - 1, (arr) + (stack[i - 1].start - 1) * size, &r));
664             if (m - l <= r - m) {
665               PetscCall(PetscTimSortResizeBuffer_Private(buff, (m - l + 1) * size));
666               PetscCall(PetscTimSortMergeLo_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
667             } else {
668               PetscCall(PetscTimSortResizeBuffer_Private(buff, (r - m + 1) * size));
669               PetscCall(PetscTimSortMergeHi_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
670             }
671           }
672           /* Update A with merge */
673           stack[i - 2].size += stack[i - 1].size;
674           /* Push C up the stack */
675           stack[i - 1].start = stack[i].start;
676           stack[i - 1].size  = stack[i].size;
677         } else {
678         /* merge C into B */
679         mergeBC:
680           /* If B[-1] <= C[0] then... you know the drill */
681           if ((*cmp)(arr + (stack[i].start - 1) * size, arr + (stack[i].start) * size, ctx) > 0) {
682             m = stack[i].start;
683             /* Search B for C[0] insertion */
684             PetscCall(PetscGallopSearchLeft_Private(arr, size, cmp, ctx, stack[i - 1].start, stack[i].start - 1, arr + stack[i].start * size, &l));
685             /* Search C for B[-1] insertion */
686             PetscCall(PetscGallopSearchRight_Private(arr, size, cmp, ctx, stack[i].start, stack[i].start + stack[i].size - 1, (arr) + (stack[i].start - 1) * size, &r));
687             if (m - l <= r - m) {
688               PetscCall(PetscTimSortResizeBuffer_Private(buff, (m - l + 1) * size));
689               PetscCall(PetscTimSortMergeLo_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
690             } else {
691               PetscCall(PetscTimSortResizeBuffer_Private(buff, (r - m + 1) * size));
692               PetscCall(PetscTimSortMergeHi_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
693             }
694           }
695           /* Update B with merge */
696           stack[i - 1].size += stack[i].size;
697         }
698         --i;
699       } else if (stack[i - 1].size <= stack[i].size) {
700         /* merge C into B */
701         goto mergeBC;
702       }
703     }
704     if (itemp == i) break;
705   }
706   *stacksize = i;
707   PetscFunctionReturn(PETSC_SUCCESS);
708 }
709 
710 static inline PetscErrorCode PetscTimSortMergeCollapseWithArray_Private(char *arr, size_t asize, char *barr, size_t bsize, CompFunc cmp, void *ctx, PetscTimSortBuffer *abuff, PetscTimSortBuffer *bbuff, PetscTimSortStack *stack, PetscInt *stacksize)
711 {
712   PetscInt i = *stacksize;
713 
714   PetscFunctionBegin;
715   while (i) {
716     PetscInt l, m, r, itemp = i;
717 
718     if (i == 1) {
719       /* A = stack[i-1], B = stack[i] */
720       if (stack[i - 1].size < stack[i].size) {
721         /* if A[-1] <= B[0] then sorted */
722         if ((*cmp)(arr + (stack[i].start - 1) * asize, arr + (stack[i].start) * asize, ctx) > 0) {
723           m = stack[i].start;
724           /* Search A for B[0] insertion */
725           PetscCall(PetscGallopSearchLeft_Private(arr, asize, cmp, ctx, stack[i - 1].start, stack[i].start - 1, (arr) + stack[i].start * asize, &l));
726           /* Search B for A[-1] insertion */
727           PetscCall(PetscGallopSearchRight_Private(arr, asize, cmp, ctx, stack[i].start, stack[i].start + stack[i].size - 1, arr + (stack[i].start - 1) * asize, &r));
728           if (m - l <= r - m) {
729             PetscCall(PetscTimSortResizeBuffer_Private(abuff, (m - l + 1) * asize));
730             PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (m - l + 1) * bsize));
731             PetscCall(PetscTimSortMergeLoWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
732           } else {
733             PetscCall(PetscTimSortResizeBuffer_Private(abuff, (r - m + 1) * asize));
734             PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (r - m + 1) * bsize));
735             PetscCall(PetscTimSortMergeHiWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
736           }
737         }
738         /* Update A with merge */
739         stack[i - 1].size += stack[i].size;
740         --i;
741       }
742     } else {
743       /* i > 2, i.e. C exists
744        A = stack[i-2], B = stack[i-1], C = stack[i]; */
745       if (stack[i - 2].size <= stack[i - 1].size + stack[i].size) {
746         if (stack[i - 2].size < stack[i].size) {
747           /* merge B into A, but if A[-1] <= B[0] then already sorted */
748           if ((*cmp)(arr + (stack[i - 1].start - 1) * asize, arr + (stack[i - 1].start) * asize, ctx) > 0) {
749             m = stack[i - 1].start;
750             /* Search A for B[0] insertion */
751             PetscCall(PetscGallopSearchLeft_Private(arr, asize, cmp, ctx, stack[i - 2].start, stack[i - 1].start - 1, (arr) + (stack[i - 1].start) * asize, &l));
752             /* Search B for A[-1] insertion */
753             PetscCall(PetscGallopSearchRight_Private(arr, asize, cmp, ctx, stack[i - 1].start, stack[i - 1].start + stack[i - 1].size - 1, (arr) + (stack[i - 1].start - 1) * asize, &r));
754             if (m - l <= r - m) {
755               PetscCall(PetscTimSortResizeBuffer_Private(abuff, (m - l + 1) * asize));
756               PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (m - l + 1) * bsize));
757               PetscCall(PetscTimSortMergeLoWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
758             } else {
759               PetscCall(PetscTimSortResizeBuffer_Private(abuff, (r - m + 1) * asize));
760               PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (r - m + 1) * bsize));
761               PetscCall(PetscTimSortMergeHiWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
762             }
763           }
764           /* Update A with merge */
765           stack[i - 2].size += stack[i - 1].size;
766           /* Push C up the stack */
767           stack[i - 1].start = stack[i].start;
768           stack[i - 1].size  = stack[i].size;
769         } else {
770         /* merge C into B */
771         mergeBC:
772           /* If B[-1] <= C[0] then... you know the drill */
773           if ((*cmp)(arr + (stack[i].start - 1) * asize, arr + (stack[i].start) * asize, ctx) > 0) {
774             m = stack[i].start;
775             /* Search B for C[0] insertion */
776             PetscCall(PetscGallopSearchLeft_Private(arr, asize, cmp, ctx, stack[i - 1].start, stack[i].start - 1, arr + stack[i].start * asize, &l));
777             /* Search C for B[-1] insertion */
778             PetscCall(PetscGallopSearchRight_Private(arr, asize, cmp, ctx, stack[i].start, stack[i].start + stack[i].size - 1, (arr) + (stack[i].start - 1) * asize, &r));
779             if (m - l <= r - m) {
780               PetscCall(PetscTimSortResizeBuffer_Private(abuff, (m - l + 1) * asize));
781               PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (m - l + 1) * bsize));
782               PetscCall(PetscTimSortMergeLoWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
783             } else {
784               PetscCall(PetscTimSortResizeBuffer_Private(abuff, (r - m + 1) * asize));
785               PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (r - m + 1) * bsize));
786               PetscCall(PetscTimSortMergeHiWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
787             }
788           }
789           /* Update B with merge */
790           stack[i - 1].size += stack[i].size;
791         }
792         --i;
793       } else if (stack[i - 1].size <= stack[i].size) {
794         /* merge C into B */
795         goto mergeBC;
796       }
797     }
798     if (itemp == i) break;
799   }
800   *stacksize = i;
801   PetscFunctionReturn(PETSC_SUCCESS);
802 }
803 
804 /* March sequentially through the array building up a "run" of weakly increasing or strictly decreasing contiguous
805  elements. Decreasing runs are reversed by swapping. If the run is less than minrun, artificially extend it via either
806  binary insertion sort or regulat insertion sort */
807 static inline PetscErrorCode PetscTimSortBuildRun_Private(char *arr, char *tarr, size_t size, CompFunc cmp, void *ctx, PetscInt n, PetscInt minrun, PetscInt runstart, PetscInt *runend)
808 {
809   const PetscInt re = PetscMin(runstart + minrun, n - 1);
810   PetscInt       ri = runstart;
811 
812   PetscFunctionBegin;
813   if (PetscUnlikely(runstart == n - 1)) {
814     *runend = runstart;
815     PetscFunctionReturn(PETSC_SUCCESS);
816   }
817   /* guess whether run is ascending or descending and tally up the longest consecutive run. essentially a coinflip for random data */
818   if ((*cmp)((arr) + (ri + 1) * size, (arr) + ri * size, ctx) < 0) {
819     ++ri;
820     while (ri < n - 1) {
821       if ((*cmp)((arr) + (ri + 1) * size, (arr) + ri * size, ctx) >= 0) break;
822       ++ri;
823     }
824     {
825       PetscInt lo = runstart, hi = ri;
826       for (; lo < hi; ++lo, --hi) COPYSWAPPY(arr + lo * size, arr + hi * size, tarr, size);
827     }
828   } else {
829     ++ri;
830     while (ri < n - 1) {
831       if ((*cmp)((arr) + (ri + 1) * size, (arr) + ri * size, ctx) < 0) break;
832       ++ri;
833     }
834   }
835   if (ri < re) {
836     /* the attempt failed, this section likely contains random data. If ri got close to minrun (within 50%) then we try
837      binary search */
838     if (ri - runstart <= minrun >> 1) {
839       ++MIN_GALLOP_GLOBAL; /* didn't get close hedge our bets against random data */
840       PetscCall(PetscInsertionSort_Private(arr, tarr, size, cmp, ctx, runstart, ri, re));
841     } else {
842       PetscCall(PetscBinaryInsertionSort_Private(arr, tarr, size, cmp, ctx, runstart, ri, re));
843     }
844     *runend = re;
845   } else *runend = ri;
846   PetscFunctionReturn(PETSC_SUCCESS);
847 }
848 
849 static inline PetscErrorCode PetscTimSortBuildRunWithArray_Private(char *arr, char *atarr, size_t asize, char *barr, char *btarr, size_t bsize, CompFunc cmp, void *ctx, PetscInt n, PetscInt minrun, PetscInt runstart, PetscInt *runend)
850 {
851   const PetscInt re = PetscMin(runstart + minrun, n - 1);
852   PetscInt       ri = runstart;
853 
854   PetscFunctionBegin;
855   if (PetscUnlikely(runstart == n - 1)) {
856     *runend = runstart;
857     PetscFunctionReturn(PETSC_SUCCESS);
858   }
859   /* guess whether run is ascending or descending and tally up the longest consecutive run. essentially a coinflip for random data */
860   if ((*cmp)((arr) + (ri + 1) * asize, arr + (ri * asize), ctx) < 0) {
861     ++ri;
862     while (ri < n - 1) {
863       if ((*cmp)((arr) + (ri + 1) * asize, (arr) + ri * asize, ctx) >= 0) break;
864       ++ri;
865     }
866     {
867       PetscInt lo = runstart, hi = ri;
868       for (; lo < hi; ++lo, --hi) COPYSWAPPY2(arr + lo * asize, arr + hi * asize, asize, barr + lo * bsize, barr + hi * bsize, bsize, atarr);
869     }
870   } else {
871     ++ri;
872     while (ri < n - 1) {
873       if ((*cmp)((arr) + (ri + 1) * asize, (arr) + ri * asize, ctx) < 0) break;
874       ++ri;
875     }
876   }
877   if (ri < re) {
878     /* the attempt failed, this section likely contains random data. If ri got close to minrun (within 50%) then we try
879      binary search */
880     if (ri - runstart <= minrun >> 1) {
881       ++MIN_GALLOP_GLOBAL; /* didn't get close hedge our bets against random data */
882       PetscCall(PetscInsertionSortWithArray_Private(arr, atarr, asize, barr, btarr, bsize, cmp, ctx, runstart, ri, re));
883     } else {
884       PetscCall(PetscBinaryInsertionSortWithArray_Private(arr, atarr, asize, barr, btarr, bsize, cmp, ctx, runstart, ri, re));
885     }
886     *runend = re;
887   } else *runend = ri;
888   PetscFunctionReturn(PETSC_SUCCESS);
889 }
890 
891 /*@C
892   PetscTimSort - Sorts an array in place in increasing order using Tim Peters adaptive sorting algorithm.
893 
894   Not Collective
895 
896   Input Parameters:
897 + n    - number of values
898 . arr  - array to be sorted
899 . size - size in bytes of the datatype held in arr
900 . cmp  - function pointer to comparison function
901 - ctx  - optional context to be passed to comparison function, NULL if not needed
902 
903   Output Parameter:
904 . arr - sorted array
905 
906   Notes:
907   Timsort makes the assumption that input data is already likely partially ordered, or that it contains contiguous
908   sections (termed 'runs') where the data is locally ordered (but not necessarily globally ordered). It therefore aims to
909   select slices of the array in such a way that resulting mergesorts operate on near perfectly length-balanced arrays. To
910   do so it repeatedly triggers attempts throughout to merge adjacent runs.
911 
912   Should one run continuously "win" a comparison the algorithm begins the "gallop" phase. It will aggressively search
913   the "winner" for the location of the "losers" next entry (and vice versa) to copy all preceding elements into place in
914   bulk. However if the data is truly unordered (as is the case with random data) the immense gains possible from these
915   searches are expected __not__ to repay their costs. While adjacent arrays are almost all nearly the same size, they
916   likely all contain similar data.
917 
918   Example Usage:
919   The comparison function must follow the `qsort()` comparison function paradigm, returning the sign of the difference
920   between its arguments. If left < right : return -1, if left == right : return 0, if left > right : return 1. The user
921   may also
922   change or reverse the order of the sort by flipping the above. Note that stability of the sort is only guaranteed if
923   the comparison function forms a valid trigraph. For example when sorting an array of type "my_type" in increasing
924   order
925 
926 .vb
927   int my_increasing_comparison_function(const void *left, const void *right, void *ctx) {
928     my_type l = *(my_type *) left, r = *(my_type *) right;
929     return (l < r) ? -1 : (l > r);
930   }
931 .ve
932 
933   Note the context is unused here but you may use it to pass and subsequently access whatever information required
934   inside the comparison function. The context pointer will unaltered except for any changes made inside the comparison function.
935   Then pass the function
936 .vb
937   PetscTimSort(n, arr, sizeof(arr[0]), my_increasing_comparison_function, ctx)
938 .ve
939 
940   Fortran Notes:
941   To use this from fortran you must write a comparison subroutine with 4 arguments which accepts left, right, ctx and
942   returns result. For example
943 .vb
944  subroutine CompareIntegers(left,right,ctx,result)
945    implicit none
946 
947    PetscInt,intent(in) :: left, right
948    type(UserCtx)       :: ctx
949    integer,intent(out) :: result
950 
951    if (left < right) then
952      result = -1
953    else if (left == right) then
954      result = 0
955    else
956      result = 1
957    end if
958    return
959  end subroutine CompareIntegers
960 .ve
961 
962   References:
963 . * - Tim Peters. https://bugs.python.org/file4451/timsort.txt
964 
965   Level: developer
966 
967 .seealso: `PetscTimSortWithArray()`, `PetscIntSortSemiOrdered()`, `PetscRealSortSemiOrdered()`, `PetscMPIIntSortSemiOrdered()`
968 @*/
969 PetscErrorCode PetscTimSort(PetscInt n, void *arr, size_t size, int (*cmp)(const void *, const void *, void *), void *ctx)
970 {
971   PetscInt           stacksize = 0, minrun, runstart = 0, runend = 0;
972   PetscTimSortStack  runstack[128];
973   PetscTimSortBuffer buff;
974   /* stacksize  = log_phi(n) = log_2(n)/log_2(phi), so 128 is enough for ~5.614e26 elements.
975    It is so unlikely that this limit is reached that this is __never__ checked for */
976 
977   PetscFunctionBegin;
978   /* Compute minrun. Minrun should be (32, 65) such that N/minrun
979    is a power of 2 or one plus a power of 2 */
980   {
981     PetscInt t = n, r = 0;
982     /* r becomes 1 if the least significant bits contain at least one off bit */
983     while (t >= 64) {
984       r |= t & 1;
985       t >>= 1;
986     }
987     minrun = t + r;
988   }
989   if (PetscDefined(USE_DEBUG)) {
990     PetscCall(PetscInfo(NULL, "minrun = %" PetscInt_FMT "\n", minrun));
991     if (n < 64) {
992       PetscCall(PetscInfo(NULL, "n %" PetscInt_FMT " < 64, consider using PetscSortInt() instead\n", n));
993     } else PetscCheck(!(minrun < 32) && !(minrun > 65), PETSC_COMM_SELF, PETSC_ERR_PLIB, "Calculated minrun %" PetscInt_FMT " not in range (32,65)", minrun);
994   }
995   PetscCall(PetscMalloc1((size_t)minrun * size, &buff.ptr));
996   buff.size         = (size_t)minrun * size;
997   buff.maxsize      = (size_t)n * size;
998   MIN_GALLOP_GLOBAL = MIN_GALLOP_CONST_GLOBAL;
999   while (runstart < n) {
1000     /* Check if additional entries are at least partially ordered and build natural run */
1001     PetscCall(PetscTimSortBuildRun_Private((char *)arr, buff.ptr, size, cmp, ctx, n, minrun, runstart, &runend));
1002     runstack[stacksize].start = runstart;
1003     runstack[stacksize].size  = runend - runstart + 1;
1004     PetscCall(PetscTimSortMergeCollapse_Private((char *)arr, size, cmp, ctx, &buff, runstack, &stacksize));
1005     ++stacksize;
1006     runstart = runend + 1;
1007   }
1008   /* Have been inside while, so discard last stacksize++ */
1009   --stacksize;
1010   PetscCall(PetscTimSortForceCollapse_Private((char *)arr, size, cmp, ctx, &buff, runstack, stacksize));
1011   PetscCall(PetscFree(buff.ptr));
1012   MIN_GALLOP_GLOBAL = MIN_GALLOP_CONST_GLOBAL;
1013   PetscFunctionReturn(PETSC_SUCCESS);
1014 }
1015 
1016 /*@C
1017   PetscTimSortWithArray - Sorts an array in place in increasing order using Tim Peters adaptive sorting algorithm and
1018   reorders a second array to match the first. The arrays need not be the same type.
1019 
1020   Not Collective
1021 
1022   Input Parameters:
1023 + n     - number of values
1024 . asize - size in bytes of the datatype held in arr
1025 . bsize - size in bytes of the datatype held in barr
1026 . cmp   - function pointer to comparison function
1027 - ctx   - optional context to be passed to comparison function, NULL if not needed
1028 
1029   Input/Output Parameters:
1030 + arr  - array to be sorted, on output it is sorted
1031 - barr - array to be reordered, on output it is reordered
1032 
1033   Notes:
1034   The arrays need not be of the same type, however barr MUST contain at least as many elements as arr and the two CANNOT
1035   overlap.
1036 
1037   Timsort makes the assumption that input data is already likely partially ordered, or that it contains contiguous
1038   sections (termed 'runs') where the data is locally ordered (but not necessarily globally ordered). It therefore aims
1039   to select slices of the array in such a way that resulting mergesorts operate on near perfectly length-balanced
1040   arrays. To do so it repeatedly triggers attempts throughout to merge adjacent runs.
1041 
1042   Should one run continuously "win" a comparison the algorithm begins the "gallop" phase. It will aggressively search
1043   the "winner" for the location of the "losers" next entry (and vice versa) to copy all preceding elements into place in
1044   bulk. However if the data is truly unordered (as is the case with random data) the immense gains possible from these
1045   searches are expected __not__ to repay their costs. While adjacent arrays are almost all nearly the same size, they
1046   likely all contain similar data.
1047 
1048   Example Usage:
1049   The comparison function must follow the `qsort()` comparison function paradigm, returning the sign of the difference
1050   between its arguments. If left < right : return -1, if left == right : return 0, if left > right : return 1. The user
1051   may also change or reverse the order of the sort by flipping the above. Note that stability of the sort is only
1052   guaranteed if the comparison function forms a valid trigraph. For example when sorting an array of type "my_type" in
1053   increasing order
1054 
1055 .vb
1056   int my_increasing_comparison_function(const void *left, const void *right, void *ctx) {
1057     my_type l = *(my_type *) left, r = *(my_type *) right;
1058     return (l < r) ? -1 : (l > r);
1059   }
1060 .ve
1061 
1062   Note the context is unused here but you may use it to pass and subsequently access whatever information required
1063   inside the comparison function. The context pointer will unaltered except for any changes made inside the comparison function.
1064   Then pass the function
1065 .vb
1066   PetscTimSortWithArray(n, arr, sizeof(arr[0]), barr, sizeof(barr[0]), my_increasing_comparison_function, ctx)
1067 .ve
1068 
1069   Fortran Notes:
1070   To use this from fortran you must write a comparison subroutine with 4 arguments which accepts left, right, ctx and
1071   returns result. For example
1072 .vb
1073  subroutine CompareIntegers(left,right,ctx,result)
1074    implicit none
1075 
1076    PetscInt,intent(in) :: left, right
1077    type(UserCtx)       :: ctx
1078    integer,intent(out) :: result
1079 
1080    if (left < right) then
1081      result = -1
1082    else if (left == right) then
1083      result = 0
1084    else
1085      result = 1
1086    end if
1087    return
1088  end subroutine CompareIntegers
1089 .ve
1090 
1091   References:
1092 . * - Tim Peters. https://bugs.python.org/file4451/timsort.txt
1093 
1094   Level: developer
1095 
1096 .seealso: `PetscTimSort()`, `PetscIntSortSemiOrderedWithArray()`, `PetscRealSortSemiOrderedWithArrayInt()`, `PetscMPIIntSortSemiOrderedWithArray()`
1097 @*/
1098 PetscErrorCode PetscTimSortWithArray(PetscInt n, void *arr, size_t asize, void *barr, size_t bsize, int (*cmp)(const void *, const void *, void *), void *ctx)
1099 {
1100   PetscInt           stacksize = 0, minrun, runstart = 0, runend = 0;
1101   PetscTimSortStack  runstack[128];
1102   PetscTimSortBuffer abuff, bbuff;
1103   /* stacksize  = log_phi(n) = log_2(n)/log_2(phi), so 128 is enough for ~5.614e26 elements.
1104    It is so unlikely that this limit is reached that this is __never__ checked for */
1105 
1106   PetscFunctionBegin;
1107   /* Compute minrun. Minrun should be (32, 65) such that N/minrun
1108    is a power of 2 or one plus a power of 2 */
1109   {
1110     PetscInt t = n, r = 0;
1111     /* r becomes 1 if the least significant bits contain at least one off bit */
1112     while (t >= 64) {
1113       r |= t & 1;
1114       t >>= 1;
1115     }
1116     minrun = t + r;
1117   }
1118   if (PetscDefined(USE_DEBUG)) {
1119     PetscCall(PetscInfo(NULL, "minrun = %" PetscInt_FMT "\n", minrun));
1120     PetscCheck(n < 64 || (minrun >= 32 && minrun <= 65), PETSC_COMM_SELF, PETSC_ERR_PLIB, "Calculated minrun %" PetscInt_FMT " not in range (32,65)", minrun);
1121   }
1122   PetscCall(PetscMalloc1((size_t)minrun * asize, &abuff.ptr));
1123   abuff.size    = (size_t)minrun * asize;
1124   abuff.maxsize = (size_t)n * asize;
1125   PetscCall(PetscMalloc1((size_t)minrun * bsize, &bbuff.ptr));
1126   bbuff.size        = (size_t)minrun * bsize;
1127   bbuff.maxsize     = (size_t)n * bsize;
1128   MIN_GALLOP_GLOBAL = MIN_GALLOP_CONST_GLOBAL;
1129   while (runstart < n) {
1130     /* Check if additional entries are at least partially ordered and build natural run */
1131     PetscCall(PetscTimSortBuildRunWithArray_Private((char *)arr, abuff.ptr, asize, (char *)barr, bbuff.ptr, bsize, cmp, ctx, n, minrun, runstart, &runend));
1132     runstack[stacksize].start = runstart;
1133     runstack[stacksize].size  = runend - runstart + 1;
1134     PetscCall(PetscTimSortMergeCollapseWithArray_Private((char *)arr, asize, (char *)barr, bsize, cmp, ctx, &abuff, &bbuff, runstack, &stacksize));
1135     ++stacksize;
1136     runstart = runend + 1;
1137   }
1138   /* Have been inside while, so discard last stacksize++ */
1139   --stacksize;
1140   PetscCall(PetscTimSortForceCollapseWithArray_Private((char *)arr, asize, (char *)barr, bsize, cmp, ctx, &abuff, &bbuff, runstack, stacksize));
1141   PetscCall(PetscFree(abuff.ptr));
1142   PetscCall(PetscFree(bbuff.ptr));
1143   MIN_GALLOP_GLOBAL = MIN_GALLOP_CONST_GLOBAL;
1144   PetscFunctionReturn(PETSC_SUCCESS);
1145 }
1146 
1147 /*@
1148   PetscIntSortSemiOrdered - Sorts an array of `PetscInt` in place in increasing order.
1149 
1150   Not Collective
1151 
1152   Input Parameters:
1153 + n   - number of values
1154 - arr - array of integers
1155 
1156   Output Parameter:
1157 . arr - sorted array of integers
1158 
1159   Notes:
1160   If the array is less than 64 entries long `PetscSortInt()` is automatically used.
1161 
1162   This function serves as an alternative to `PetscSortInt()`. While this function works for any array of integers it is
1163   significantly faster if the array is not totally random. There are exceptions to this and so it is __highly__
1164   recommended that the user benchmark their code to see which routine is fastest.
1165 
1166   Level: intermediate
1167 
1168 .seealso: `PetscTimSort()`, `PetscSortInt()`, `PetscSortIntWithPermutation()`
1169 @*/
1170 PetscErrorCode PetscIntSortSemiOrdered(PetscInt n, PetscInt arr[])
1171 {
1172   PetscFunctionBegin;
1173   if (n <= 1) PetscFunctionReturn(PETSC_SUCCESS);
1174   PetscValidIntPointer(arr, 2);
1175   if (n < 64) {
1176     PetscCall(PetscSortInt(n, arr));
1177   } else {
1178     PetscCall(PetscTimSort(n, arr, sizeof(PetscInt), Compare_PetscInt_Private, NULL));
1179   }
1180   PetscFunctionReturn(PETSC_SUCCESS);
1181 }
1182 
1183 /*@
1184   PetscIntSortSemiOrderedWithArray - Sorts an array of `PetscInt` in place in increasing order and reorders a second
1185   `PetscInt` array to match the first.
1186 
1187   Not Collective
1188 
1189   Input Parameter:
1190 . n - number of values
1191 
1192   Input/Output Parameters:
1193 + arr1 - array of integers to be sorted, modified on output
1194 - arr2 - array of integers to be reordered, modified on output
1195 
1196   Notes:
1197   The arrays CANNOT overlap.
1198 
1199   This function serves as an alternative to PetscSortIntWithArray(). While this function works for any array of integers it is
1200   significantly faster if the array is not totally random. There are exceptions to this and so it is __highly__
1201   recommended that the user benchmark their code to see which routine is fastest.
1202 
1203   Level: intermediate
1204 
1205 .seealso: `PetscTimSortWithArray()`, `PetscSortIntWithArray()`, `PetscSortIntWithPermutation()`
1206 @*/
1207 PetscErrorCode PetscIntSortSemiOrderedWithArray(PetscInt n, PetscInt arr1[], PetscInt arr2[])
1208 {
1209   PetscFunctionBegin;
1210   if (n <= 1) PetscFunctionReturn(PETSC_SUCCESS);
1211   PetscValidIntPointer(arr1, 2);
1212   PetscValidIntPointer(arr2, 3);
1213   /* cannot export out to PetscIntSortWithArray here since it isn't stable */
1214   PetscCall(PetscTimSortWithArray(n, arr1, sizeof(PetscInt), arr2, sizeof(PetscInt), Compare_PetscInt_Private, NULL));
1215   PetscFunctionReturn(PETSC_SUCCESS);
1216 }
1217 
1218 /*@
1219   PetscMPIIntSortSemiOrdered - Sorts an array of `PetscMPIInt` in place in increasing order.
1220 
1221   Not Collective
1222 
1223   Input Parameters:
1224 + n   - number of values
1225 - arr - array of `PetscMPIInt`
1226 
1227   Output Parameter:
1228 . arr - sorted array of integers
1229 
1230   Notes:
1231   If the array is less than 64 entries long `PetscSortMPIInt()` is automatically used.
1232 
1233   This function serves as an alternative to `PetscSortMPIInt()`. While this function works for any array of `PetscMPIInt` it is
1234   significantly faster if the array is not totally random. There are exceptions to this and so it is __highly__
1235   recommended that the user benchmark their code to see which routine is fastest.
1236 
1237   Level: intermediate
1238 
1239 .seealso: `PetscTimSort()`, `PetscSortMPIInt()`
1240 @*/
1241 PetscErrorCode PetscMPIIntSortSemiOrdered(PetscInt n, PetscMPIInt arr[])
1242 {
1243   PetscFunctionBegin;
1244   if (n <= 1) PetscFunctionReturn(PETSC_SUCCESS);
1245   PetscValidIntPointer(arr, 2);
1246   if (n < 64) {
1247     PetscCall(PetscSortMPIInt(n, arr));
1248   } else {
1249     PetscCall(PetscTimSort(n, arr, sizeof(PetscMPIInt), Compare_PetscMPIInt_Private, NULL));
1250   }
1251   PetscFunctionReturn(PETSC_SUCCESS);
1252 }
1253 
1254 /*@
1255   PetscMPIIntSortSemiOrderedWithArray - Sorts an array of `PetscMPIInt` in place in increasing order and reorders a second `PetscMPIInt`
1256   array to match the first.
1257 
1258   Not Collective
1259 
1260   Input Parameter:
1261 . n - number of values
1262 
1263   Input/Output Parameters:
1264 + arr1 - array of integers to be sorted, modified on output
1265 - arr2 - array of integers to be reordered, modified on output
1266 
1267   Notes:
1268   The arrays CANNOT overlap.
1269 
1270   This function serves as an alternative to `PetscSortMPIIntWithArray()`. While this function works for any array of integers it is
1271   significantly faster if the array is not totally random. There are exceptions to this and so it is __highly__
1272   recommended that the user benchmark their code to see which routine is fastest.
1273 
1274   Level: intermediate
1275 
1276 .seealso: `PetscTimSortWithArray()`, `PetscSortMPIIntWithArray()`, `PetscSortMPIIntWithPermutation()`
1277 @*/
1278 PetscErrorCode PetscMPIIntSortSemiOrderedWithArray(PetscInt n, PetscMPIInt arr1[], PetscMPIInt arr2[])
1279 {
1280   PetscFunctionBegin;
1281   if (n <= 1) PetscFunctionReturn(PETSC_SUCCESS);
1282   PetscValidIntPointer(arr1, 2);
1283   PetscValidIntPointer(arr2, 3);
1284   /* cannot export out to PetscMPIIntSortWithArray here since it isn't stable */
1285   PetscCall(PetscTimSortWithArray(n, arr1, sizeof(PetscMPIInt), arr2, sizeof(PetscMPIInt), Compare_PetscMPIInt_Private, NULL));
1286   PetscFunctionReturn(PETSC_SUCCESS);
1287 }
1288 
1289 /*@
1290   PetscRealSortSemiOrdered - Sorts an array of `PetscReal` in place in increasing order.
1291 
1292   Not Collective
1293 
1294   Input Parameters:
1295 + n   - number of values
1296 - arr - array of `PetscReal`
1297 
1298   Output Parameter:
1299 . arr - sorted array of integers
1300 
1301   Notes:
1302   If the array is less than 64 entries long `PetscSortReal()` is automatically used.
1303 
1304   This function serves as an alternative to `PetscSortReal()`. While this function works for any array of `PetscReal` it is
1305   significantly faster if the array is not totally random. There are exceptions to this and so it is __highly__
1306   recommended that the user benchmark their code to see which routine is fastest.
1307 
1308   Level: intermediate
1309 
1310 .seealso: `PetscTimSort()`, `PetscSortReal()`, `PetscSortRealWithPermutation()`
1311 @*/
1312 PetscErrorCode PetscRealSortSemiOrdered(PetscInt n, PetscReal arr[])
1313 {
1314   PetscFunctionBegin;
1315   if (n <= 1) PetscFunctionReturn(PETSC_SUCCESS);
1316   PetscValidRealPointer(arr, 2);
1317   if (n < 64) {
1318     PetscCall(PetscSortReal(n, arr));
1319   } else {
1320     PetscCall(PetscTimSort(n, arr, sizeof(PetscReal), Compare_PetscReal_Private, NULL));
1321   }
1322   PetscFunctionReturn(PETSC_SUCCESS);
1323 }
1324 
1325 /*@
1326   PetscRealSortSemiOrderedWithArrayInt - Sorts an array of `PetscReal` in place in increasing order and reorders a second
1327   array of `PetscInt` to match the first.
1328 
1329   Not Collective
1330 
1331   Input Parameter:
1332 . n - number of values
1333 
1334   Input/Output Parameters:
1335 + arr1 - array of `PetscReal` to be sorted, modified on output
1336 - arr2 - array of `PetscInt` to be reordered, modified on output
1337 
1338   Notes:
1339   This function serves as an alternative to `PetscSortRealWithArray()`. While this function works for any array of `PetscReal` it is
1340   significantly faster if the array is not totally random. There are exceptions to this and so it is __highly__
1341   recommended that the user benchmark their code to see which routine is fastest.
1342 
1343   Level: intermediate
1344 
1345 .seealso: `PetscTimSortWithArray()`, `PetscSortRealWithArrayInt()`, `PetscSortRealWithPermutation()`
1346 @*/
1347 PetscErrorCode PetscRealSortSemiOrderedWithArrayInt(PetscInt n, PetscReal arr1[], PetscInt arr2[])
1348 {
1349   PetscFunctionBegin;
1350   if (n <= 1) PetscFunctionReturn(PETSC_SUCCESS);
1351   PetscValidRealPointer(arr1, 2);
1352   PetscValidIntPointer(arr2, 3);
1353   /* cannot export out to PetscRealSortWithArrayInt here since it isn't stable */
1354   PetscCall(PetscTimSortWithArray(n, arr1, sizeof(PetscReal), arr2, sizeof(PetscInt), Compare_PetscReal_Private, NULL));
1355   PetscFunctionReturn(PETSC_SUCCESS);
1356 }
1357