xref: /petsc/src/sys/utils/sortso.c (revision 9140fee14acbea959c6ed74f4836a1a89f708038)
1 #include <petscsys.h> /*I  "petscsys.h"  I*/
2 #include <petsc/private/petscimpl.h>
3 
4 static inline int Compare_PetscMPIInt_Private(const void *left, const void *right, PETSC_UNUSED void *ctx)
5 {
6   PetscMPIInt l = *(PetscMPIInt *)left, r = *(PetscMPIInt *)right;
7   return (l < r) ? -1 : (l > r);
8 }
9 
10 static inline int Compare_PetscInt_Private(const void *left, const void *right, PETSC_UNUSED void *ctx)
11 {
12   PetscInt l = *(PetscInt *)left, r = *(PetscInt *)right;
13   return (l < r) ? -1 : (l > r);
14 }
15 
16 static inline int Compare_PetscReal_Private(const void *left, const void *right, PETSC_UNUSED void *ctx)
17 {
18   PetscReal l = *(PetscReal *)left, r = *(PetscReal *)right;
19   return (l < r) ? -1 : (l > r);
20 }
21 
22 #define MIN_GALLOP_CONST_GLOBAL 8
23 static PetscInt MIN_GALLOP_GLOBAL = MIN_GALLOP_CONST_GLOBAL;
24 typedef int (*CompFunc)(const void *, const void *, void *);
25 
26 /* Mostly to force clang uses the builtins, but can't hurt to have gcc compilers also do the same */
27 #if !defined(PETSC_USE_DEBUG) && defined(__GNUC__)
28 static inline void COPYSWAPPY(char *a, char *b, char *t, size_t size)
29 {
30   __builtin_memcpy(t, b, size);
31   __builtin_memmove(b, a, size);
32   __builtin_memcpy(a, t, size);
33   return;
34 }
35 
36 static inline void COPYSWAPPY2(char *al, char *ar, size_t asize, char *bl, char *br, size_t bsize, char *t)
37 {
38   __builtin_memcpy(t, ar, asize);
39   __builtin_memmove(ar, al, asize);
40   __builtin_memcpy(al, t, asize);
41   __builtin_memcpy(t, br, bsize);
42   __builtin_memmove(br, bl, bsize);
43   __builtin_memcpy(bl, t, bsize);
44   return;
45 }
46 
47 static inline void Petsc_memcpy(char *dest, const char *src, size_t size)
48 {
49   __builtin_memcpy(dest, src, size);
50   return;
51 }
52 
53 static inline void Petsc_memcpy2(char *adest, const char *asrc, size_t asize, char *bdest, const char *bsrc, size_t bsize)
54 {
55   __builtin_memcpy(adest, asrc, asize);
56   __builtin_memcpy(bdest, bsrc, bsize);
57   return;
58 }
59 
60 static inline void Petsc_memmove(char *dest, const char *src, size_t size)
61 {
62   __builtin_memmove(dest, src, size);
63   return;
64 }
65 
66 static inline void Petsc_memmove2(char *adest, const char *asrc, size_t asize, char *bdest, const char *bsrc, size_t bsize)
67 {
68   __builtin_memmove(adest, asrc, asize);
69   __builtin_memmove(bdest, bsrc, bsize);
70   return;
71 }
72 #else
73 static inline void COPYSWAPPY(char *a, char *b, char *t, size_t size)
74 {
75   PetscFunctionBegin;
76   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(t, b, size));
77   PetscCallAbort(PETSC_COMM_SELF, PetscMemmove(b, a, size));
78   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(a, t, size));
79   PetscFunctionReturnVoid();
80 }
81 
82 static inline void COPYSWAPPY2(char *al, char *ar, size_t asize, char *bl, char *br, size_t bsize, char *t)
83 {
84   PetscFunctionBegin;
85   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(t, ar, asize));
86   PetscCallAbort(PETSC_COMM_SELF, PetscMemmove(ar, al, asize));
87   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(al, t, asize));
88   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(t, br, bsize));
89   PetscCallAbort(PETSC_COMM_SELF, PetscMemmove(br, bl, bsize));
90   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(bl, t, bsize));
91   PetscFunctionReturnVoid();
92 }
93 
94 static inline void Petsc_memcpy(char *dest, const char *src, size_t size)
95 {
96   PetscFunctionBegin;
97   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(dest, src, size));
98   PetscFunctionReturnVoid();
99 }
100 
101 static inline void Petsc_memcpy2(char *adest, const char *asrc, size_t asize, char *bdest, const char *bsrc, size_t bsize)
102 {
103   PetscFunctionBegin;
104   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(adest, asrc, asize));
105   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(bdest, bsrc, bsize));
106   PetscFunctionReturnVoid();
107 }
108 
109 static inline void Petsc_memmove(char *dest, const char *src, size_t size)
110 {
111   PetscFunctionBegin;
112   PetscCallAbort(PETSC_COMM_SELF, PetscMemmove(dest, src, size));
113   PetscFunctionReturnVoid();
114 }
115 
116 static inline void Petsc_memmove2(char *adest, const char *asrc, size_t asize, char *bdest, const char *bsrc, size_t bsize)
117 {
118   PetscFunctionBegin;
119   PetscCallAbort(PETSC_COMM_SELF, PetscMemmove(adest, asrc, asize));
120   PetscCallAbort(PETSC_COMM_SELF, PetscMemmove(bdest, bsrc, bsize));
121   PetscFunctionReturnVoid();
122 }
123 #endif
124 
125 /* Start left look right. Looking for e.g. B[0] in A or mergelo. l inclusive, r inclusive. Returns first m such that arr[m] >
126  x. Output also inclusive.
127 
128  NOTE: Both gallopsearch functions CANNOT distinguish between inserting AFTER the entire array vs inserting at entry n!! For example for an array:
129 
130    {0,1,2,3,4,5}
131 
132    when looking to insert "5" this routine will return *m = 6, but when looking to insert "6" it will ALSO return *m = 6.
133   */
134 static inline PetscErrorCode PetscGallopSearchLeft_Private(const char *arr, size_t size, CompFunc cmp, void *ctx, PetscInt l, PetscInt r, const char *x, PetscInt *m)
135 {
136   PetscInt last = l, k = 1, mid, cur = l + 1;
137 
138   PetscFunctionBegin;
139   *m = l;
140   PetscAssert(r >= l, PETSC_COMM_SELF, PETSC_ERR_PLIB, "r %" PetscInt_FMT " < l %" PetscInt_FMT " in PetscGallopSearchLeft", r, l);
141   if ((*cmp)(x, arr + r * size, ctx) >= 0) {
142     *m = r;
143     PetscFunctionReturn(PETSC_SUCCESS);
144   }
145   if ((*cmp)(x, (arr) + l * size, ctx) < 0 || PetscUnlikely(!(r - l))) PetscFunctionReturn(PETSC_SUCCESS);
146   while (PETSC_TRUE) {
147     if (PetscUnlikely(cur > r)) {
148       cur = r;
149       break;
150     }
151     if ((*cmp)(x, (arr) + cur * size, ctx) < 0) break;
152     last = cur;
153     cur += (k <<= 1) + 1;
154     ++k;
155   }
156   /* standard binary search but take last 0 mid 0 cur 1 into account*/
157   while (cur > last + 1) {
158     mid = last + ((cur - last) >> 1);
159     if ((*cmp)(x, (arr) + mid * size, ctx) < 0) {
160       cur = mid;
161     } else {
162       last = mid;
163     }
164   }
165   *m = cur;
166   PetscFunctionReturn(PETSC_SUCCESS);
167 }
168 
169 /* Start right look left. Looking for e.g. A[-1] in B or mergehi. l inclusive, r inclusive. Returns last m such that arr[m]
170  < x. Output also inclusive */
171 static inline PetscErrorCode PetscGallopSearchRight_Private(const char *arr, size_t size, CompFunc cmp, void *ctx, PetscInt l, PetscInt r, const char *x, PetscInt *m)
172 {
173   PetscInt last = r, k = 1, mid, cur = r - 1;
174 
175   PetscFunctionBegin;
176   *m = r;
177   PetscAssert(r >= l, PETSC_COMM_SELF, PETSC_ERR_PLIB, "r %" PetscInt_FMT " < l %" PetscInt_FMT " in PetscGallopSearchRight", r, l);
178   if ((*cmp)(x, arr + l * size, ctx) <= 0) {
179     *m = l;
180     PetscFunctionReturn(PETSC_SUCCESS);
181   }
182   if ((*cmp)(x, (arr) + r * size, ctx) > 0 || PetscUnlikely(!(r - l))) PetscFunctionReturn(PETSC_SUCCESS);
183   while (PETSC_TRUE) {
184     if (PetscUnlikely(cur < l)) {
185       cur = l;
186       break;
187     }
188     if ((*cmp)(x, (arr) + cur * size, ctx) > 0) break;
189     last = cur;
190     cur -= (k <<= 1) + 1;
191     ++k;
192   }
193   /* standard binary search but take last r-1 mid r-1 cur r-2 into account*/
194   while (last > cur + 1) {
195     mid = last - ((last - cur) >> 1);
196     if ((*cmp)(x, (arr) + mid * size, ctx) > 0) {
197       cur = mid;
198     } else {
199       last = mid;
200     }
201   }
202   *m = cur;
203   PetscFunctionReturn(PETSC_SUCCESS);
204 }
205 
206 /* Mergesort where size of left half <= size of right half, so mergesort is done left to right. Arr should be pointer to
207  complete array, left is first index of left array, mid is first index of right array, right is last index of right
208  array */
209 static inline PetscErrorCode PetscTimSortMergeLo_Private(char *arr, char *tarr, size_t size, CompFunc cmp, void *ctx, PetscInt left, PetscInt mid, PetscInt right)
210 {
211   PetscInt       i = 0, j = mid, k = left, gallopleft = 0, gallopright = 0;
212   const PetscInt llen = mid - left;
213 
214   PetscFunctionBegin;
215   Petsc_memcpy(tarr, arr + (left * size), llen * size);
216   while ((i < llen) && (j <= right)) {
217     if ((*cmp)(tarr + (i * size), arr + (j * size), ctx) < 0) {
218       Petsc_memcpy(arr + (k * size), tarr + (i * size), size);
219       ++k;
220       gallopright = 0;
221       if (++i < llen && ++gallopleft >= MIN_GALLOP_GLOBAL) {
222         PetscInt l1, l2, diff1, diff2;
223         do {
224           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
225           /* search temp for right[j], can move up to that of temp into arr immediately */
226           PetscCall(PetscGallopSearchLeft_Private(tarr, size, cmp, ctx, i, llen - 1, arr + (j * size), &l1));
227           diff1 = l1 - i;
228           Petsc_memcpy(arr + (k * size), tarr + (i * size), diff1 * size);
229           k += diff1;
230           i = l1;
231           /* search right for temp[i], can move up to that many of right into arr */
232           PetscCall(PetscGallopSearchLeft_Private(arr, size, cmp, ctx, j, right, tarr + (i * size), &l2));
233           diff2 = l2 - j;
234           Petsc_memmove((arr) + k * size, (arr) + j * size, diff2 * size);
235           k += diff2;
236           j = l2;
237           if (i >= llen || j > right) break;
238         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
239         ++MIN_GALLOP_GLOBAL;
240       }
241     } else {
242       Petsc_memmove(arr + (k * size), arr + (j * size), size);
243       ++k;
244       gallopleft = 0;
245       if (++j <= right && ++gallopright >= MIN_GALLOP_GLOBAL) {
246         PetscInt l1, l2, diff1, diff2;
247         do {
248           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
249           /* search right for temp[i], can move up to that many of right into arr */
250           PetscCall(PetscGallopSearchLeft_Private(arr, size, cmp, ctx, j, right, tarr + (i * size), &l2));
251           diff2 = l2 - j;
252           Petsc_memmove(arr + (k * size), arr + (j * size), diff2 * size);
253           k += diff2;
254           j = l2;
255           /* search temp for right[j], can copy up to that of temp into arr immediately */
256           PetscCall(PetscGallopSearchLeft_Private(tarr, size, cmp, ctx, i, llen - 1, arr + (j * size), &l1));
257           diff1 = l1 - i;
258           Petsc_memcpy(arr + (k * size), tarr + (i * size), diff1 * size);
259           k += diff1;
260           i = l1;
261           if (i >= llen || j > right) break;
262         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
263         ++MIN_GALLOP_GLOBAL;
264       }
265     }
266   }
267   if (i < llen) Petsc_memcpy(arr + (k * size), tarr + (i * size), (llen - i) * size);
268   PetscFunctionReturn(PETSC_SUCCESS);
269 }
270 
271 static inline PetscErrorCode PetscTimSortMergeLoWithArray_Private(char *arr, char *atarr, size_t asize, char *barr, char *btarr, size_t bsize, CompFunc cmp, void *ctx, PetscInt left, PetscInt mid, PetscInt right)
272 {
273   PetscInt       i = 0, j = mid, k = left, gallopleft = 0, gallopright = 0;
274   const PetscInt llen = mid - left;
275 
276   PetscFunctionBegin;
277   Petsc_memcpy2(atarr, arr + (left * asize), llen * asize, btarr, barr + (left * bsize), llen * bsize);
278   while ((i < llen) && (j <= right)) {
279     if ((*cmp)(atarr + (i * asize), arr + (j * asize), ctx) < 0) {
280       Petsc_memcpy2(arr + (k * asize), atarr + (i * asize), asize, barr + (k * bsize), btarr + (i * bsize), bsize);
281       ++k;
282       gallopright = 0;
283       if (++i < llen && ++gallopleft >= MIN_GALLOP_GLOBAL) {
284         PetscInt l1, l2, diff1, diff2;
285         do {
286           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
287           /* search temp for right[j], can move up to that of temp into arr immediately */
288           PetscCall(PetscGallopSearchLeft_Private(atarr, asize, cmp, ctx, i, llen - 1, arr + (j * asize), &l1));
289           diff1 = l1 - i;
290           Petsc_memcpy2(arr + (k * asize), atarr + (i * asize), diff1 * asize, barr + (k * bsize), btarr + (i * bsize), diff1 * bsize);
291           k += diff1;
292           i = l1;
293           /* search right for temp[i], can move up to that many of right into arr */
294           PetscCall(PetscGallopSearchLeft_Private(arr, asize, cmp, ctx, j, right, atarr + (i * asize), &l2));
295           diff2 = l2 - j;
296           Petsc_memmove2(arr + (k * asize), arr + (j * asize), diff2 * asize, barr + (k * bsize), barr + (j * bsize), diff2 * bsize);
297           k += diff2;
298           j = l2;
299           if (i >= llen || j > right) break;
300         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
301         ++MIN_GALLOP_GLOBAL;
302       }
303     } else {
304       Petsc_memmove2(arr + (k * asize), arr + (j * asize), asize, barr + (k * bsize), barr + (j * bsize), bsize);
305       ++k;
306       gallopleft = 0;
307       if (++j <= right && ++gallopright >= MIN_GALLOP_GLOBAL) {
308         PetscInt l1, l2, diff1, diff2;
309         do {
310           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
311           /* search right for temp[i], can move up to that many of right into arr */
312           PetscCall(PetscGallopSearchLeft_Private(arr, asize, cmp, ctx, j, right, atarr + (i * asize), &l2));
313           diff2 = l2 - j;
314           Petsc_memmove2(arr + (k * asize), arr + (j * asize), diff2 * asize, barr + (k * bsize), barr + (j * bsize), diff2 * bsize);
315           k += diff2;
316           j = l2;
317           /* search temp for right[j], can copy up to that of temp into arr immediately */
318           PetscCall(PetscGallopSearchLeft_Private(atarr, asize, cmp, ctx, i, llen - 1, arr + (j * asize), &l1));
319           diff1 = l1 - i;
320           Petsc_memcpy2(arr + (k * asize), atarr + (i * asize), diff1 * asize, barr + (k * bsize), btarr + (i * bsize), diff1 * bsize);
321           k += diff1;
322           i = l1;
323           if (i >= llen || j > right) break;
324         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
325         ++MIN_GALLOP_GLOBAL;
326       }
327     }
328   }
329   if (i < llen) Petsc_memcpy2(arr + (k * asize), atarr + (i * asize), (llen - i) * asize, barr + (k * bsize), btarr + (i * bsize), (llen - i) * bsize);
330   PetscFunctionReturn(PETSC_SUCCESS);
331 }
332 
333 /* Mergesort where size of right half < size of left half, so mergesort is done right to left. Arr should be pointer to
334  complete array, left is first index of left array, mid is first index of right array, right is last index of right
335  array */
336 static inline PetscErrorCode PetscTimSortMergeHi_Private(char *arr, char *tarr, size_t size, CompFunc cmp, void *ctx, PetscInt left, PetscInt mid, PetscInt right)
337 {
338   PetscInt       i = right - mid, j = mid - 1, k = right, gallopleft = 0, gallopright = 0;
339   const PetscInt rlen = right - mid + 1;
340 
341   PetscFunctionBegin;
342   Petsc_memcpy(tarr, (arr) + mid * size, rlen * size);
343   while ((i >= 0) && (j >= left)) {
344     if ((*cmp)((tarr) + i * size, (arr) + j * size, ctx) > 0) {
345       Petsc_memcpy((arr) + k * size, (tarr) + i * size, size);
346       --k;
347       gallopleft = 0;
348       if (--i >= 0 && ++gallopright >= MIN_GALLOP_GLOBAL) {
349         PetscInt l1, l2, diff1, diff2;
350         do {
351           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
352           /* search temp for left[j], can copy up to that many of temp into arr */
353           PetscCall(PetscGallopSearchRight_Private(tarr, size, cmp, ctx, 0, i, (arr) + j * size, &l1));
354           diff1 = i - l1;
355           Petsc_memcpy((arr) + (k - diff1 + 1) * size, (tarr) + (l1 + 1) * size, diff1 * size);
356           k -= diff1;
357           i = l1;
358           /* search left for temp[i], can move up to that many of left up arr */
359           PetscCall(PetscGallopSearchRight_Private(arr, size, cmp, ctx, left, j, (tarr) + i * size, &l2));
360           diff2 = j - l2;
361           Petsc_memmove((arr) + (k - diff2 + 1) * size, (arr) + (l2 + 1) * size, diff2 * size);
362           k -= diff2;
363           j = l2;
364           if (i < 0 || j < left) break;
365         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
366         ++MIN_GALLOP_GLOBAL;
367       }
368     } else {
369       Petsc_memmove((arr) + k * size, (arr) + j * size, size);
370       --k;
371       gallopright = 0;
372       if (--j >= left && ++gallopleft >= MIN_GALLOP_GLOBAL) {
373         PetscInt l1, l2, diff1, diff2;
374         do {
375           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
376           /* search left for temp[i], can move up to that many of left up arr */
377           PetscCall(PetscGallopSearchRight_Private(arr, size, cmp, ctx, left, j, (tarr) + i * size, &l2));
378           diff2 = j - l2;
379           Petsc_memmove((arr) + (k - diff2 + 1) * size, (arr) + (l2 + 1) * size, diff2 * size);
380           k -= diff2;
381           j = l2;
382           /* search temp for left[j], can copy up to that many of temp into arr */
383           PetscCall(PetscGallopSearchRight_Private(tarr, size, cmp, ctx, 0, i, (arr) + j * size, &l1));
384           diff1 = i - l1;
385           Petsc_memcpy((arr) + (k - diff1 + 1) * size, (tarr) + (l1 + 1) * size, diff1 * size);
386           k -= diff1;
387           i = l1;
388           if (i < 0 || j < left) break;
389         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
390         ++MIN_GALLOP_GLOBAL;
391       }
392     }
393   }
394   if (i >= 0) Petsc_memcpy((arr) + left * size, tarr, (i + 1) * size);
395   PetscFunctionReturn(PETSC_SUCCESS);
396 }
397 
398 static inline PetscErrorCode PetscTimSortMergeHiWithArray_Private(char *arr, char *atarr, size_t asize, char *barr, char *btarr, size_t bsize, CompFunc cmp, void *ctx, PetscInt left, PetscInt mid, PetscInt right)
399 {
400   PetscInt       i = right - mid, j = mid - 1, k = right, gallopleft = 0, gallopright = 0;
401   const PetscInt rlen = right - mid + 1;
402 
403   PetscFunctionBegin;
404   Petsc_memcpy2(atarr, (arr) + mid * asize, rlen * asize, btarr, (barr) + mid * bsize, rlen * bsize);
405   while ((i >= 0) && (j >= left)) {
406     if ((*cmp)((atarr) + i * asize, (arr) + j * asize, ctx) > 0) {
407       Petsc_memcpy2((arr) + k * asize, (atarr) + i * asize, asize, (barr) + k * bsize, (btarr) + i * bsize, bsize);
408       --k;
409       gallopleft = 0;
410       if (--i >= 0 && ++gallopright >= MIN_GALLOP_GLOBAL) {
411         PetscInt l1, l2, diff1, diff2;
412         do {
413           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
414           /* search temp for left[j], can copy up to that many of temp into arr */
415           PetscCall(PetscGallopSearchRight_Private(atarr, asize, cmp, ctx, 0, i, (arr) + j * asize, &l1));
416           diff1 = i - l1;
417           Petsc_memcpy2((arr) + (k - diff1 + 1) * asize, (atarr) + (l1 + 1) * asize, diff1 * asize, (barr) + (k - diff1 + 1) * bsize, (btarr) + (l1 + 1) * bsize, diff1 * bsize);
418           k -= diff1;
419           i = l1;
420           /* search left for temp[i], can move up to that many of left up arr */
421           PetscCall(PetscGallopSearchRight_Private(arr, asize, cmp, ctx, left, j, (atarr) + i * asize, &l2));
422           diff2 = j - l2;
423           Petsc_memmove2((arr) + (k - diff2 + 1) * asize, (arr) + (l2 + 1) * asize, diff2 * asize, (barr) + (k - diff2 + 1) * bsize, (barr) + (l2 + 1) * bsize, diff2 * bsize);
424           k -= diff2;
425           j = l2;
426           if (i < 0 || j < left) break;
427         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
428         ++MIN_GALLOP_GLOBAL;
429       }
430     } else {
431       Petsc_memmove2((arr) + k * asize, (arr) + j * asize, asize, (barr) + k * bsize, (barr) + j * bsize, bsize);
432       --k;
433       gallopright = 0;
434       if (--j >= left && ++gallopleft >= MIN_GALLOP_GLOBAL) {
435         PetscInt l1, l2, diff1, diff2;
436         do {
437           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
438           /* search left for temp[i], can move up to that many of left up arr */
439           PetscCall(PetscGallopSearchRight_Private(arr, asize, cmp, ctx, left, j, (atarr) + i * asize, &l2));
440           diff2 = j - l2;
441           Petsc_memmove2((arr) + (k - diff2 + 1) * asize, (arr) + (l2 + 1) * asize, diff2 * asize, (barr) + (k - diff2 + 1) * bsize, (barr) + (l2 + 1) * bsize, diff2 * bsize);
442           k -= diff2;
443           j = l2;
444           /* search temp for left[j], can copy up to that many of temp into arr */
445           PetscCall(PetscGallopSearchRight_Private(atarr, asize, cmp, ctx, 0, i, (arr) + j * asize, &l1));
446           diff1 = i - l1;
447           Petsc_memcpy2((arr) + (k - diff1 + 1) * asize, (atarr) + (l1 + 1) * asize, diff1 * asize, (barr) + (k - diff1 + 1) * bsize, (btarr) + (l1 + 1) * bsize, diff1 * bsize);
448           k -= diff1;
449           i = l1;
450           if (i < 0 || j < left) break;
451         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
452         ++MIN_GALLOP_GLOBAL;
453       }
454     }
455   }
456   if (i >= 0) Petsc_memcpy2((arr) + left * asize, atarr, (i + 1) * asize, (barr) + left * bsize, btarr, (i + 1) * bsize);
457   PetscFunctionReturn(PETSC_SUCCESS);
458 }
459 
460 /* Left is inclusive lower bound of array slice, start is start location of unsorted section, right is inclusive upper
461  bound of array slice. If unsure of where unsorted section starts or if entire length is unsorted pass start = left */
462 static inline PetscErrorCode PetscInsertionSort_Private(char *arr, char *tarr, size_t size, CompFunc cmp, void *ctx, PetscInt left, PetscInt start, PetscInt right)
463 {
464   PetscInt i = start == left ? start + 1 : start;
465 
466   PetscFunctionBegin;
467   for (; i <= right; ++i) {
468     PetscInt j = i - 1;
469     Petsc_memcpy(tarr, arr + (i * size), size);
470     while ((j >= left) && ((*cmp)(tarr, (arr) + j * size, ctx) < 0)) {
471       COPYSWAPPY(arr + (j + 1) * size, arr + j * size, tarr + size, size);
472       --j;
473     }
474     Petsc_memcpy((arr) + (j + 1) * size, tarr, size);
475   }
476   PetscFunctionReturn(PETSC_SUCCESS);
477 }
478 
479 static inline PetscErrorCode PetscInsertionSortWithArray_Private(char *arr, char *atarr, size_t asize, char *barr, char *btarr, size_t bsize, CompFunc cmp, void *ctx, PetscInt left, PetscInt start, PetscInt right)
480 {
481   PetscInt i = start == left ? start + 1 : start;
482 
483   PetscFunctionBegin;
484   for (; i <= right; ++i) {
485     PetscInt j = i - 1;
486     Petsc_memcpy2(atarr, arr + (i * asize), asize, btarr, barr + (i * bsize), bsize);
487     while ((j >= left) && ((*cmp)(atarr, arr + (j * asize), ctx) < 0)) {
488       COPYSWAPPY2(arr + (j + 1) * asize, arr + j * asize, asize, barr + (j + 1) * bsize, barr + j * bsize, bsize, atarr + asize);
489       --j;
490     }
491     Petsc_memcpy2(arr + (j + 1) * asize, atarr, asize, barr + (j + 1) * bsize, btarr, bsize);
492   }
493   PetscFunctionReturn(PETSC_SUCCESS);
494 }
495 
496 /* See PetscInsertionSort_Private */
497 static inline PetscErrorCode PetscBinaryInsertionSort_Private(char *arr, char *tarr, size_t size, CompFunc cmp, void *ctx, PetscInt left, PetscInt start, PetscInt right)
498 {
499   PetscInt i = start == left ? start + 1 : start;
500 
501   PetscFunctionBegin;
502   for (; i <= right; ++i) {
503     PetscInt l = left, r = i;
504     Petsc_memcpy(tarr, arr + (i * size), size);
505     do {
506       const PetscInt m = l + ((r - l) >> 1);
507       if ((*cmp)(tarr, arr + (m * size), ctx) < 0) {
508         r = m;
509       } else {
510         l = m + 1;
511       }
512     } while (l < r);
513     Petsc_memmove(arr + ((l + 1) * size), arr + (l * size), (i - l) * size);
514     Petsc_memcpy(arr + (l * size), tarr, size);
515   }
516   PetscFunctionReturn(PETSC_SUCCESS);
517 }
518 
519 static inline PetscErrorCode PetscBinaryInsertionSortWithArray_Private(char *arr, char *atarr, size_t asize, char *barr, char *btarr, size_t bsize, CompFunc cmp, void *ctx, PetscInt left, PetscInt start, PetscInt right)
520 {
521   PetscInt i = start == left ? start + 1 : start;
522 
523   PetscFunctionBegin;
524   for (; i <= right; ++i) {
525     PetscInt l = left, r = i;
526     Petsc_memcpy2(atarr, arr + (i * asize), asize, btarr, barr + (i * bsize), bsize);
527     do {
528       const PetscInt m = l + ((r - l) >> 1);
529       if ((*cmp)(atarr, arr + (m * asize), ctx) < 0) {
530         r = m;
531       } else {
532         l = m + 1;
533       }
534     } while (l < r);
535     Petsc_memmove2(arr + ((l + 1) * asize), arr + (l * asize), (i - l) * asize, barr + ((l + 1) * bsize), barr + (l * bsize), (i - l) * bsize);
536     Petsc_memcpy2(arr + (l * asize), atarr, asize, barr + (l * bsize), btarr, bsize);
537   }
538   PetscFunctionReturn(PETSC_SUCCESS);
539 }
540 
541 typedef struct {
542   PetscInt size;
543   PetscInt start;
544 #if defined(__CRAY_AARCH64) /* segfaults with Cray compilers for aarch64 on a64FX */
545 } PetscTimSortStack;
546 #else
547 } PetscTimSortStack PETSC_ATTRIBUTEALIGNED(2 * sizeof(PetscInt));
548 #endif
549 
550 typedef struct {
551   char *ptr PETSC_ATTRIBUTEALIGNED(PETSC_MEMALIGN);
552   size_t    size;
553   size_t    maxsize;
554 } PetscTimSortBuffer;
555 
556 static inline PetscErrorCode PetscTimSortResizeBuffer_Private(PetscTimSortBuffer *buff, size_t newSize)
557 {
558   PetscFunctionBegin;
559   if (PetscLikely(newSize <= buff->size)) PetscFunctionReturn(PETSC_SUCCESS);
560   {
561     /* Can't be larger than n, there is merit to simply allocating buff to n to begin with */
562     size_t newMax = PetscMin(newSize * newSize, buff->maxsize);
563     PetscCall(PetscFree(buff->ptr));
564     PetscCall(PetscMalloc1(newMax, &buff->ptr));
565     buff->size = newMax;
566   }
567   PetscFunctionReturn(PETSC_SUCCESS);
568 }
569 
570 static inline PetscErrorCode PetscTimSortForceCollapse_Private(char *arr, size_t size, CompFunc cmp, void *ctx, PetscTimSortBuffer *buff, PetscTimSortStack *stack, PetscInt stacksize)
571 {
572   PetscFunctionBegin;
573   for (; stacksize; --stacksize) {
574     /* A = stack[i-1], B = stack[i], if A[-1] <= B[0] means sorted */
575     if ((*cmp)(arr + (stack[stacksize].start - 1) * size, arr + (stack[stacksize].start) * size, ctx) > 0) {
576       PetscInt l, m = stack[stacksize].start, r;
577       /* Search A for B[0] insertion */
578       PetscCall(PetscGallopSearchLeft_Private(arr, size, cmp, ctx, stack[stacksize - 1].start, stack[stacksize].start - 1, (arr) + (stack[stacksize].start) * size, &l));
579       /* Search B for A[-1] insertion */
580       PetscCall(PetscGallopSearchRight_Private(arr, size, cmp, ctx, stack[stacksize].start, stack[stacksize].start + stack[stacksize].size - 1, (arr) + (stack[stacksize].start - 1) * size, &r));
581       if (m - l <= r - m) {
582         PetscCall(PetscTimSortResizeBuffer_Private(buff, (m - l + 1) * size));
583         PetscCall(PetscTimSortMergeLo_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
584       } else {
585         PetscCall(PetscTimSortResizeBuffer_Private(buff, (r - m + 1) * size));
586         PetscCall(PetscTimSortMergeHi_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
587       }
588     }
589     /* Update A with merge */
590     stack[stacksize - 1].size += stack[stacksize].size;
591   }
592   PetscFunctionReturn(PETSC_SUCCESS);
593 }
594 
595 static inline PetscErrorCode PetscTimSortForceCollapseWithArray_Private(char *arr, size_t asize, char *barr, size_t bsize, CompFunc cmp, void *ctx, PetscTimSortBuffer *abuff, PetscTimSortBuffer *bbuff, PetscTimSortStack *stack, PetscInt stacksize)
596 {
597   PetscFunctionBegin;
598   for (; stacksize; --stacksize) {
599     /* A = stack[i-1], B = stack[i], if A[-1] <= B[0] means sorted */
600     if ((*cmp)(arr + (stack[stacksize].start - 1) * asize, arr + (stack[stacksize].start) * asize, ctx) > 0) {
601       PetscInt l, m = stack[stacksize].start, r;
602       /* Search A for B[0] insertion */
603       PetscCall(PetscGallopSearchLeft_Private(arr, asize, cmp, ctx, stack[stacksize - 1].start, stack[stacksize].start - 1, (arr) + (stack[stacksize].start) * asize, &l));
604       /* Search B for A[-1] insertion */
605       PetscCall(PetscGallopSearchRight_Private(arr, asize, cmp, ctx, stack[stacksize].start, stack[stacksize].start + stack[stacksize].size - 1, (arr) + (stack[stacksize].start - 1) * asize, &r));
606       if (m - l <= r - m) {
607         PetscCall(PetscTimSortResizeBuffer_Private(abuff, (m - l + 1) * asize));
608         PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (m - l + 1) * bsize));
609         PetscCall(PetscTimSortMergeLoWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
610       } else {
611         PetscCall(PetscTimSortResizeBuffer_Private(abuff, (r - m + 1) * asize));
612         PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (r - m + 1) * bsize));
613         PetscCall(PetscTimSortMergeHiWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
614       }
615     }
616     /* Update A with merge */
617     stack[stacksize - 1].size += stack[stacksize].size;
618   }
619   PetscFunctionReturn(PETSC_SUCCESS);
620 }
621 
622 static inline PetscErrorCode PetscTimSortMergeCollapse_Private(char *arr, size_t size, CompFunc cmp, void *ctx, PetscTimSortBuffer *buff, PetscTimSortStack *stack, PetscInt *stacksize)
623 {
624   PetscInt i = *stacksize;
625 
626   PetscFunctionBegin;
627   while (i) {
628     PetscInt l, m, r, itemp = i;
629 
630     if (i == 1) {
631       /* A = stack[i-1], B = stack[i] */
632       if (stack[i - 1].size < stack[i].size) {
633         /* if A[-1] <= B[0] then sorted */
634         if ((*cmp)(arr + (stack[i].start - 1) * size, arr + (stack[i].start) * size, ctx) > 0) {
635           m = stack[i].start;
636           /* Search A for B[0] insertion */
637           PetscCall(PetscGallopSearchLeft_Private(arr, size, cmp, ctx, stack[i - 1].start, stack[i].start - 1, (arr) + stack[i].start * size, &l));
638           /* Search B for A[-1] insertion */
639           PetscCall(PetscGallopSearchRight_Private(arr, size, cmp, ctx, stack[i].start, stack[i].start + stack[i].size - 1, arr + (stack[i].start - 1) * size, &r));
640           if (m - l <= r - m) {
641             PetscCall(PetscTimSortResizeBuffer_Private(buff, (m - l + 1) * size));
642             PetscCall(PetscTimSortMergeLo_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
643           } else {
644             PetscCall(PetscTimSortResizeBuffer_Private(buff, (r - m + 1) * size));
645             PetscCall(PetscTimSortMergeHi_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
646           }
647         }
648         /* Update A with merge */
649         stack[i - 1].size += stack[i].size;
650         --i;
651       }
652     } else {
653       /* i > 2, i.e. C exists
654        A = stack[i-2], B = stack[i-1], C = stack[i]; */
655       if (stack[i - 2].size <= stack[i - 1].size + stack[i].size) {
656         if (stack[i - 2].size < stack[i].size) {
657           /* merge B into A, but if A[-1] <= B[0] then already sorted */
658           if ((*cmp)(arr + (stack[i - 1].start - 1) * size, arr + (stack[i - 1].start) * size, ctx) > 0) {
659             m = stack[i - 1].start;
660             /* Search A for B[0] insertion */
661             PetscCall(PetscGallopSearchLeft_Private(arr, size, cmp, ctx, stack[i - 2].start, stack[i - 1].start - 1, (arr) + (stack[i - 1].start) * size, &l));
662             /* Search B for A[-1] insertion */
663             PetscCall(PetscGallopSearchRight_Private(arr, size, cmp, ctx, stack[i - 1].start, stack[i - 1].start + stack[i - 1].size - 1, (arr) + (stack[i - 1].start - 1) * size, &r));
664             if (m - l <= r - m) {
665               PetscCall(PetscTimSortResizeBuffer_Private(buff, (m - l + 1) * size));
666               PetscCall(PetscTimSortMergeLo_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
667             } else {
668               PetscCall(PetscTimSortResizeBuffer_Private(buff, (r - m + 1) * size));
669               PetscCall(PetscTimSortMergeHi_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
670             }
671           }
672           /* Update A with merge */
673           stack[i - 2].size += stack[i - 1].size;
674           /* Push C up the stack */
675           stack[i - 1].start = stack[i].start;
676           stack[i - 1].size  = stack[i].size;
677         } else {
678         /* merge C into B */
679         mergeBC:
680           /* If B[-1] <= C[0] then... you know the drill */
681           if ((*cmp)(arr + (stack[i].start - 1) * size, arr + (stack[i].start) * size, ctx) > 0) {
682             m = stack[i].start;
683             /* Search B for C[0] insertion */
684             PetscCall(PetscGallopSearchLeft_Private(arr, size, cmp, ctx, stack[i - 1].start, stack[i].start - 1, arr + stack[i].start * size, &l));
685             /* Search C for B[-1] insertion */
686             PetscCall(PetscGallopSearchRight_Private(arr, size, cmp, ctx, stack[i].start, stack[i].start + stack[i].size - 1, (arr) + (stack[i].start - 1) * size, &r));
687             if (m - l <= r - m) {
688               PetscCall(PetscTimSortResizeBuffer_Private(buff, (m - l + 1) * size));
689               PetscCall(PetscTimSortMergeLo_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
690             } else {
691               PetscCall(PetscTimSortResizeBuffer_Private(buff, (r - m + 1) * size));
692               PetscCall(PetscTimSortMergeHi_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
693             }
694           }
695           /* Update B with merge */
696           stack[i - 1].size += stack[i].size;
697         }
698         --i;
699       } else if (stack[i - 1].size <= stack[i].size) {
700         /* merge C into B */
701         goto mergeBC;
702       }
703     }
704     if (itemp == i) break;
705   }
706   *stacksize = i;
707   PetscFunctionReturn(PETSC_SUCCESS);
708 }
709 
710 static inline PetscErrorCode PetscTimSortMergeCollapseWithArray_Private(char *arr, size_t asize, char *barr, size_t bsize, CompFunc cmp, void *ctx, PetscTimSortBuffer *abuff, PetscTimSortBuffer *bbuff, PetscTimSortStack *stack, PetscInt *stacksize)
711 {
712   PetscInt i = *stacksize;
713 
714   PetscFunctionBegin;
715   while (i) {
716     PetscInt l, m, r, itemp = i;
717 
718     if (i == 1) {
719       /* A = stack[i-1], B = stack[i] */
720       if (stack[i - 1].size < stack[i].size) {
721         /* if A[-1] <= B[0] then sorted */
722         if ((*cmp)(arr + (stack[i].start - 1) * asize, arr + (stack[i].start) * asize, ctx) > 0) {
723           m = stack[i].start;
724           /* Search A for B[0] insertion */
725           PetscCall(PetscGallopSearchLeft_Private(arr, asize, cmp, ctx, stack[i - 1].start, stack[i].start - 1, (arr) + stack[i].start * asize, &l));
726           /* Search B for A[-1] insertion */
727           PetscCall(PetscGallopSearchRight_Private(arr, asize, cmp, ctx, stack[i].start, stack[i].start + stack[i].size - 1, arr + (stack[i].start - 1) * asize, &r));
728           if (m - l <= r - m) {
729             PetscCall(PetscTimSortResizeBuffer_Private(abuff, (m - l + 1) * asize));
730             PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (m - l + 1) * bsize));
731             PetscCall(PetscTimSortMergeLoWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
732           } else {
733             PetscCall(PetscTimSortResizeBuffer_Private(abuff, (r - m + 1) * asize));
734             PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (r - m + 1) * bsize));
735             PetscCall(PetscTimSortMergeHiWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
736           }
737         }
738         /* Update A with merge */
739         stack[i - 1].size += stack[i].size;
740         --i;
741       }
742     } else {
743       /* i > 2, i.e. C exists
744        A = stack[i-2], B = stack[i-1], C = stack[i]; */
745       if (stack[i - 2].size <= stack[i - 1].size + stack[i].size) {
746         if (stack[i - 2].size < stack[i].size) {
747           /* merge B into A, but if A[-1] <= B[0] then already sorted */
748           if ((*cmp)(arr + (stack[i - 1].start - 1) * asize, arr + (stack[i - 1].start) * asize, ctx) > 0) {
749             m = stack[i - 1].start;
750             /* Search A for B[0] insertion */
751             PetscCall(PetscGallopSearchLeft_Private(arr, asize, cmp, ctx, stack[i - 2].start, stack[i - 1].start - 1, (arr) + (stack[i - 1].start) * asize, &l));
752             /* Search B for A[-1] insertion */
753             PetscCall(PetscGallopSearchRight_Private(arr, asize, cmp, ctx, stack[i - 1].start, stack[i - 1].start + stack[i - 1].size - 1, (arr) + (stack[i - 1].start - 1) * asize, &r));
754             if (m - l <= r - m) {
755               PetscCall(PetscTimSortResizeBuffer_Private(abuff, (m - l + 1) * asize));
756               PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (m - l + 1) * bsize));
757               PetscCall(PetscTimSortMergeLoWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
758             } else {
759               PetscCall(PetscTimSortResizeBuffer_Private(abuff, (r - m + 1) * asize));
760               PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (r - m + 1) * bsize));
761               PetscCall(PetscTimSortMergeHiWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
762             }
763           }
764           /* Update A with merge */
765           stack[i - 2].size += stack[i - 1].size;
766           /* Push C up the stack */
767           stack[i - 1].start = stack[i].start;
768           stack[i - 1].size  = stack[i].size;
769         } else {
770         /* merge C into B */
771         mergeBC:
772           /* If B[-1] <= C[0] then... you know the drill */
773           if ((*cmp)(arr + (stack[i].start - 1) * asize, arr + (stack[i].start) * asize, ctx) > 0) {
774             m = stack[i].start;
775             /* Search B for C[0] insertion */
776             PetscCall(PetscGallopSearchLeft_Private(arr, asize, cmp, ctx, stack[i - 1].start, stack[i].start - 1, arr + stack[i].start * asize, &l));
777             /* Search C for B[-1] insertion */
778             PetscCall(PetscGallopSearchRight_Private(arr, asize, cmp, ctx, stack[i].start, stack[i].start + stack[i].size - 1, (arr) + (stack[i].start - 1) * asize, &r));
779             if (m - l <= r - m) {
780               PetscCall(PetscTimSortResizeBuffer_Private(abuff, (m - l + 1) * asize));
781               PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (m - l + 1) * bsize));
782               PetscCall(PetscTimSortMergeLoWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
783             } else {
784               PetscCall(PetscTimSortResizeBuffer_Private(abuff, (r - m + 1) * asize));
785               PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (r - m + 1) * bsize));
786               PetscCall(PetscTimSortMergeHiWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
787             }
788           }
789           /* Update B with merge */
790           stack[i - 1].size += stack[i].size;
791         }
792         --i;
793       } else if (stack[i - 1].size <= stack[i].size) {
794         /* merge C into B */
795         goto mergeBC;
796       }
797     }
798     if (itemp == i) break;
799   }
800   *stacksize = i;
801   PetscFunctionReturn(PETSC_SUCCESS);
802 }
803 
804 /* March sequentially through the array building up a "run" of weakly increasing or strictly decreasing contiguous
805  elements. Decreasing runs are reversed by swapping. If the run is less than minrun, artificially extend it via either
806  binary insertion sort or regulat insertion sort */
807 static inline PetscErrorCode PetscTimSortBuildRun_Private(char *arr, char *tarr, size_t size, CompFunc cmp, void *ctx, PetscInt n, PetscInt minrun, PetscInt runstart, PetscInt *runend)
808 {
809   const PetscInt re = PetscMin(runstart + minrun, n - 1);
810   PetscInt       ri = runstart;
811 
812   PetscFunctionBegin;
813   if (PetscUnlikely(runstart == n - 1)) {
814     *runend = runstart;
815     PetscFunctionReturn(PETSC_SUCCESS);
816   }
817   /* guess whether run is ascending or descending and tally up the longest consecutive run. essentially a coinflip for random data */
818   if ((*cmp)((arr) + (ri + 1) * size, (arr) + ri * size, ctx) < 0) {
819     ++ri;
820     while (ri < n - 1) {
821       if ((*cmp)((arr) + (ri + 1) * size, (arr) + ri * size, ctx) >= 0) break;
822       ++ri;
823     }
824     {
825       PetscInt lo = runstart, hi = ri;
826       for (; lo < hi; ++lo, --hi) COPYSWAPPY(arr + lo * size, arr + hi * size, tarr, size);
827     }
828   } else {
829     ++ri;
830     while (ri < n - 1) {
831       if ((*cmp)((arr) + (ri + 1) * size, (arr) + ri * size, ctx) < 0) break;
832       ++ri;
833     }
834   }
835   if (ri < re) {
836     /* the attempt failed, this section likely contains random data. If ri got close to minrun (within 50%) then we try
837      binary search */
838     if (ri - runstart <= minrun >> 1) {
839       ++MIN_GALLOP_GLOBAL; /* didn't get close hedge our bets against random data */
840       PetscCall(PetscInsertionSort_Private(arr, tarr, size, cmp, ctx, runstart, ri, re));
841     } else {
842       PetscCall(PetscBinaryInsertionSort_Private(arr, tarr, size, cmp, ctx, runstart, ri, re));
843     }
844     *runend = re;
845   } else *runend = ri;
846   PetscFunctionReturn(PETSC_SUCCESS);
847 }
848 
849 static inline PetscErrorCode PetscTimSortBuildRunWithArray_Private(char *arr, char *atarr, size_t asize, char *barr, char *btarr, size_t bsize, CompFunc cmp, void *ctx, PetscInt n, PetscInt minrun, PetscInt runstart, PetscInt *runend)
850 {
851   const PetscInt re = PetscMin(runstart + minrun, n - 1);
852   PetscInt       ri = runstart;
853 
854   PetscFunctionBegin;
855   if (PetscUnlikely(runstart == n - 1)) {
856     *runend = runstart;
857     PetscFunctionReturn(PETSC_SUCCESS);
858   }
859   /* guess whether run is ascending or descending and tally up the longest consecutive run. essentially a coinflip for random data */
860   if ((*cmp)((arr) + (ri + 1) * asize, arr + (ri * asize), ctx) < 0) {
861     ++ri;
862     while (ri < n - 1) {
863       if ((*cmp)((arr) + (ri + 1) * asize, (arr) + ri * asize, ctx) >= 0) break;
864       ++ri;
865     }
866     {
867       PetscInt lo = runstart, hi = ri;
868       for (; lo < hi; ++lo, --hi) COPYSWAPPY2(arr + lo * asize, arr + hi * asize, asize, barr + lo * bsize, barr + hi * bsize, bsize, atarr);
869     }
870   } else {
871     ++ri;
872     while (ri < n - 1) {
873       if ((*cmp)((arr) + (ri + 1) * asize, (arr) + ri * asize, ctx) < 0) break;
874       ++ri;
875     }
876   }
877   if (ri < re) {
878     /* the attempt failed, this section likely contains random data. If ri got close to minrun (within 50%) then we try
879      binary search */
880     if (ri - runstart <= minrun >> 1) {
881       ++MIN_GALLOP_GLOBAL; /* didn't get close hedge our bets against random data */
882       PetscCall(PetscInsertionSortWithArray_Private(arr, atarr, asize, barr, btarr, bsize, cmp, ctx, runstart, ri, re));
883     } else {
884       PetscCall(PetscBinaryInsertionSortWithArray_Private(arr, atarr, asize, barr, btarr, bsize, cmp, ctx, runstart, ri, re));
885     }
886     *runend = re;
887   } else *runend = ri;
888   PetscFunctionReturn(PETSC_SUCCESS);
889 }
890 
891 /*@C
892   PetscTimSort - Sorts an array in place in increasing order using Tim Peters <https://bugs.python.org/file4451/timsort.txt> adaptive sorting algorithm.
893 
894   Not Collective
895 
896   Input Parameters:
897 + n    - number of values
898 . arr  - array to be sorted
899 . size - size in bytes of the datatype held in arr
900 . cmp  - function pointer to comparison function
901 - ctx  - optional context to be passed to comparison function, NULL if not needed
902 
903   Output Parameter:
904 . arr - sorted array
905 
906   Level: developer
907 
908   Notes:
909   Timsort makes the assumption that input data is already likely partially ordered, or that it contains contiguous
910   sections (termed 'runs') where the data is locally ordered (but not necessarily globally ordered). It therefore aims to
911   select slices of the array in such a way that resulting mergesorts operate on near perfectly length-balanced arrays. To
912   do so it repeatedly triggers attempts throughout to merge adjacent runs.
913 
914   Should one run continuously "win" a comparison the algorithm begins the "gallop" phase. It will aggressively search
915   the "winner" for the location of the "losers" next entry (and vice versa) to copy all preceding elements into place in
916   bulk. However if the data is truly unordered (as is the case with random data) the immense gains possible from these
917   searches are expected __not__ to repay their costs. While adjacent arrays are almost all nearly the same size, they
918   likely all contain similar data.
919 
920   Example Usage:
921   The comparison function must follow the `qsort()` comparison function paradigm, returning the sign of the difference
922   between its arguments. If left < right : return -1, if left == right : return 0, if left > right : return 1. The user
923   may also
924   change or reverse the order of the sort by flipping the above. Note that stability of the sort is only guaranteed if
925   the comparison function forms a valid trigraph. For example when sorting an array of type "my_type" in increasing
926   order
927 
928 .vb
929   int increasing_comparison_function(const void *left, const void *right, void *ctx) {
930     my_type l = *(my_type *) left, r = *(my_type *) right;
931     return (l < r) ? -1 : (l > r);
932   }
933 .ve
934 
935   Note the context is unused here but you may use it to pass and subsequently access whatever information required
936   inside the comparison function. The context pointer will unaltered except for any changes made inside the comparison function.
937   Then pass the function
938 .vb
939   PetscTimSort(n, arr, sizeof(arr[0]), increasing_comparison_function, ctx)
940 .ve
941 
942   Fortran Notes:
943   To use this from Fortran you must write a comparison subroutine with 4 arguments which accepts left, right, ctx and
944   returns result. For example
945 .vb
946  subroutine CompareIntegers(left,right,ctx,result)
947    implicit none
948 
949    PetscInt,intent(in) :: left, right
950    type(UserCtx)       :: ctx
951    integer,intent(out) :: result
952 
953    if (left < right) then
954      result = -1
955    else if (left == right) then
956      result = 0
957    else
958      result = 1
959    end if
960    return
961  end subroutine CompareIntegers
962 .ve
963 
964 .seealso: `PetscTimSortWithArray()`, `PetscIntSortSemiOrdered()`, `PetscRealSortSemiOrdered()`, `PetscMPIIntSortSemiOrdered()`
965 @*/
966 PetscErrorCode PetscTimSort(PetscInt n, void *arr, size_t size, int (*cmp)(const void *, const void *, void *), void *ctx)
967 {
968   PetscInt           stacksize = 0, minrun, runstart = 0, runend = 0;
969   PetscTimSortStack  runstack[128];
970   PetscTimSortBuffer buff;
971   /* stacksize  = log_phi(n) = log_2(n)/log_2(phi), so 128 is enough for ~5.614e26 elements.
972    It is so unlikely that this limit is reached that this is __never__ checked for */
973 
974   PetscFunctionBegin;
975   /* Compute minrun. Minrun should be (32, 65) such that N/minrun
976    is a power of 2 or one plus a power of 2 */
977   {
978     PetscInt t = n, r = 0;
979     /* r becomes 1 if the least significant bits contain at least one off bit */
980     while (t >= 64) {
981       r |= t & 1;
982       t >>= 1;
983     }
984     minrun = t + r;
985   }
986   if (PetscDefined(USE_DEBUG)) {
987     PetscCall(PetscInfo(NULL, "minrun = %" PetscInt_FMT "\n", minrun));
988     if (n < 64) {
989       PetscCall(PetscInfo(NULL, "n %" PetscInt_FMT " < 64, consider using PetscSortInt() instead\n", n));
990     } else PetscCheck(!(minrun < 32) && !(minrun > 65), PETSC_COMM_SELF, PETSC_ERR_PLIB, "Calculated minrun %" PetscInt_FMT " not in range (32,65)", minrun);
991   }
992   PetscCall(PetscMalloc1((size_t)minrun * size, &buff.ptr));
993   buff.size         = (size_t)minrun * size;
994   buff.maxsize      = (size_t)n * size;
995   MIN_GALLOP_GLOBAL = MIN_GALLOP_CONST_GLOBAL;
996   while (runstart < n) {
997     /* Check if additional entries are at least partially ordered and build natural run */
998     PetscCall(PetscTimSortBuildRun_Private((char *)arr, buff.ptr, size, cmp, ctx, n, minrun, runstart, &runend));
999     runstack[stacksize].start = runstart;
1000     runstack[stacksize].size  = runend - runstart + 1;
1001     PetscCall(PetscTimSortMergeCollapse_Private((char *)arr, size, cmp, ctx, &buff, runstack, &stacksize));
1002     ++stacksize;
1003     runstart = runend + 1;
1004   }
1005   /* Have been inside while, so discard last stacksize++ */
1006   --stacksize;
1007   PetscCall(PetscTimSortForceCollapse_Private((char *)arr, size, cmp, ctx, &buff, runstack, stacksize));
1008   PetscCall(PetscFree(buff.ptr));
1009   MIN_GALLOP_GLOBAL = MIN_GALLOP_CONST_GLOBAL;
1010   PetscFunctionReturn(PETSC_SUCCESS);
1011 }
1012 
1013 /*@C
1014   PetscTimSortWithArray - Sorts an array in place in increasing order using Tim Peters <https://bugs.python.org/file4451/timsort.txt> adaptive sorting algorithm and
1015   reorders a second array to match the first. The arrays need not be the same type.
1016 
1017   Not Collective
1018 
1019   Input Parameters:
1020 + n     - number of values
1021 . asize - size in bytes of the datatype held in arr
1022 . bsize - size in bytes of the datatype held in barr
1023 . cmp   - function pointer to comparison function
1024 - ctx   - optional context to be passed to comparison function, NULL if not needed
1025 
1026   Input/Output Parameters:
1027 + arr  - array to be sorted, on output it is sorted
1028 - barr - array to be reordered, on output it is reordered
1029 
1030   Level: developer
1031 
1032   Notes:
1033   The arrays need not be of the same type, however barr MUST contain at least as many elements as arr and the two CANNOT
1034   overlap.
1035 
1036   Timsort makes the assumption that input data is already likely partially ordered, or that it contains contiguous
1037   sections (termed 'runs') where the data is locally ordered (but not necessarily globally ordered). It therefore aims
1038   to select slices of the array in such a way that resulting mergesorts operate on near perfectly length-balanced
1039   arrays. To do so it repeatedly triggers attempts throughout to merge adjacent runs.
1040 
1041   Should one run continuously "win" a comparison the algorithm begins the "gallop" phase. It will aggressively search
1042   the "winner" for the location of the "losers" next entry (and vice versa) to copy all preceding elements into place in
1043   bulk. However if the data is truly unordered (as is the case with random data) the immense gains possible from these
1044   searches are expected __not__ to repay their costs. While adjacent arrays are almost all nearly the same size, they
1045   likely all contain similar data.
1046 
1047   Example Usage:
1048   The comparison function must follow the `qsort()` comparison function paradigm, returning the sign of the difference
1049   between its arguments. If left < right \: return -1, if left == right \: return 0, if left > right \: return 1. The user
1050   may also change or reverse the order of the sort by flipping the above. Note that stability of the sort is only
1051   guaranteed if the comparison function forms a valid trigraph. For example when sorting an array of type "my_type" in
1052   increasing order
1053 
1054 .vb
1055   int increasing_comparison_function(const void *left, const void *right, void *ctx) {
1056     my_type l = *(my_type *) left, r = *(my_type *) right;
1057     return (l < r) ? -1 : (l > r);
1058   }
1059 .ve
1060 
1061   Note the context is unused here but you may use it to pass and subsequently access whatever information required
1062   inside the comparison function. The context pointer will unaltered except for any changes made inside the comparison function.
1063   Then pass the function
1064 .vb
1065   PetscTimSortWithArray(n, arr, sizeof(arr[0]), barr, sizeof(barr[0]), increasing_comparison_function, ctx)
1066 .ve
1067 
1068   Fortran Notes:
1069   To use this from Fortran you must write a comparison subroutine with 4 arguments which accepts left, right, ctx and
1070   returns result. For example
1071 .vb
1072  subroutine CompareIntegers(left,right,ctx,result)
1073    implicit none
1074 
1075    PetscInt,intent(in) :: left, right
1076    type(UserCtx)       :: ctx
1077    integer,intent(out) :: result
1078 
1079    if (left < right) then
1080      result = -1
1081    else if (left == right) then
1082      result = 0
1083    else
1084      result = 1
1085    end if
1086    return
1087  end subroutine CompareIntegers
1088 .ve
1089 
1090 .seealso: `PetscTimSort()`, `PetscIntSortSemiOrderedWithArray()`, `PetscRealSortSemiOrderedWithArrayInt()`, `PetscMPIIntSortSemiOrderedWithArray()`
1091 @*/
1092 PetscErrorCode PetscTimSortWithArray(PetscInt n, void *arr, size_t asize, void *barr, size_t bsize, int (*cmp)(const void *, const void *, void *), void *ctx)
1093 {
1094   PetscInt           stacksize = 0, minrun, runstart = 0, runend = 0;
1095   PetscTimSortStack  runstack[128];
1096   PetscTimSortBuffer abuff, bbuff;
1097   /* stacksize  = log_phi(n) = log_2(n)/log_2(phi), so 128 is enough for ~5.614e26 elements.
1098    It is so unlikely that this limit is reached that this is __never__ checked for */
1099 
1100   PetscFunctionBegin;
1101   /* Compute minrun. Minrun should be (32, 65) such that N/minrun
1102    is a power of 2 or one plus a power of 2 */
1103   {
1104     PetscInt t = n, r = 0;
1105     /* r becomes 1 if the least significant bits contain at least one off bit */
1106     while (t >= 64) {
1107       r |= t & 1;
1108       t >>= 1;
1109     }
1110     minrun = t + r;
1111   }
1112   if (PetscDefined(USE_DEBUG)) {
1113     PetscCall(PetscInfo(NULL, "minrun = %" PetscInt_FMT "\n", minrun));
1114     PetscCheck(n < 64 || (minrun >= 32 && minrun <= 65), PETSC_COMM_SELF, PETSC_ERR_PLIB, "Calculated minrun %" PetscInt_FMT " not in range (32,65)", minrun);
1115   }
1116   PetscCall(PetscMalloc1((size_t)minrun * asize, &abuff.ptr));
1117   abuff.size    = (size_t)minrun * asize;
1118   abuff.maxsize = (size_t)n * asize;
1119   PetscCall(PetscMalloc1((size_t)minrun * bsize, &bbuff.ptr));
1120   bbuff.size        = (size_t)minrun * bsize;
1121   bbuff.maxsize     = (size_t)n * bsize;
1122   MIN_GALLOP_GLOBAL = MIN_GALLOP_CONST_GLOBAL;
1123   while (runstart < n) {
1124     /* Check if additional entries are at least partially ordered and build natural run */
1125     PetscCall(PetscTimSortBuildRunWithArray_Private((char *)arr, abuff.ptr, asize, (char *)barr, bbuff.ptr, bsize, cmp, ctx, n, minrun, runstart, &runend));
1126     runstack[stacksize].start = runstart;
1127     runstack[stacksize].size  = runend - runstart + 1;
1128     PetscCall(PetscTimSortMergeCollapseWithArray_Private((char *)arr, asize, (char *)barr, bsize, cmp, ctx, &abuff, &bbuff, runstack, &stacksize));
1129     ++stacksize;
1130     runstart = runend + 1;
1131   }
1132   /* Have been inside while, so discard last stacksize++ */
1133   --stacksize;
1134   PetscCall(PetscTimSortForceCollapseWithArray_Private((char *)arr, asize, (char *)barr, bsize, cmp, ctx, &abuff, &bbuff, runstack, stacksize));
1135   PetscCall(PetscFree(abuff.ptr));
1136   PetscCall(PetscFree(bbuff.ptr));
1137   MIN_GALLOP_GLOBAL = MIN_GALLOP_CONST_GLOBAL;
1138   PetscFunctionReturn(PETSC_SUCCESS);
1139 }
1140 
1141 /*@
1142   PetscIntSortSemiOrdered - Sorts an array of `PetscInt` in place in increasing order.
1143 
1144   Not Collective
1145 
1146   Input Parameters:
1147 + n   - number of values
1148 - arr - array of integers
1149 
1150   Output Parameter:
1151 . arr - sorted array of integers
1152 
1153   Level: intermediate
1154 
1155   Notes:
1156   If the array is less than 64 entries long `PetscSortInt()` is automatically used.
1157 
1158   This function serves as an alternative to `PetscSortInt()`. While this function works for any array of integers it is
1159   significantly faster if the array is not totally random. There are exceptions to this and so it is __highly__
1160   recommended that the user benchmark their code to see which routine is fastest.
1161 
1162 .seealso: `PetscTimSort()`, `PetscSortInt()`, `PetscSortIntWithPermutation()`
1163 @*/
1164 PetscErrorCode PetscIntSortSemiOrdered(PetscInt n, PetscInt arr[])
1165 {
1166   PetscFunctionBegin;
1167   if (n <= 1) PetscFunctionReturn(PETSC_SUCCESS);
1168   PetscAssertPointer(arr, 2);
1169   if (n < 64) {
1170     PetscCall(PetscSortInt(n, arr));
1171   } else {
1172     PetscCall(PetscTimSort(n, arr, sizeof(PetscInt), Compare_PetscInt_Private, NULL));
1173   }
1174   PetscFunctionReturn(PETSC_SUCCESS);
1175 }
1176 
1177 /*@
1178   PetscIntSortSemiOrderedWithArray - Sorts an array of `PetscInt` in place in increasing order and reorders a second
1179   `PetscInt` array to match the first.
1180 
1181   Not Collective
1182 
1183   Input Parameter:
1184 . n - number of values
1185 
1186   Input/Output Parameters:
1187 + arr1 - array of integers to be sorted, modified on output
1188 - arr2 - array of integers to be reordered, modified on output
1189 
1190   Level: intermediate
1191 
1192   Notes:
1193   The arrays CANNOT overlap.
1194 
1195   This function serves as an alternative to `PetscSortIntWithArray()`. While this function works for any array of integers it is
1196   significantly faster if the array is not totally random. There are exceptions to this and so it is __highly__
1197   recommended that the user benchmark their code to see which routine is fastest.
1198 
1199 .seealso: `PetscTimSortWithArray()`, `PetscSortIntWithArray()`, `PetscSortIntWithPermutation()`
1200 @*/
1201 PetscErrorCode PetscIntSortSemiOrderedWithArray(PetscInt n, PetscInt arr1[], PetscInt arr2[])
1202 {
1203   PetscFunctionBegin;
1204   if (n <= 1) PetscFunctionReturn(PETSC_SUCCESS);
1205   PetscAssertPointer(arr1, 2);
1206   PetscAssertPointer(arr2, 3);
1207   /* cannot export out to PetscIntSortWithArray here since it isn't stable */
1208   PetscCall(PetscTimSortWithArray(n, arr1, sizeof(PetscInt), arr2, sizeof(PetscInt), Compare_PetscInt_Private, NULL));
1209   PetscFunctionReturn(PETSC_SUCCESS);
1210 }
1211 
1212 /*@
1213   PetscMPIIntSortSemiOrdered - Sorts an array of `PetscMPIInt` in place in increasing order.
1214 
1215   Not Collective
1216 
1217   Input Parameters:
1218 + n   - number of values
1219 - arr - array of `PetscMPIInt`
1220 
1221   Output Parameter:
1222 . arr - sorted array of integers
1223 
1224   Level: intermediate
1225 
1226   Notes:
1227   If the array is less than 64 entries long `PetscSortMPIInt()` is automatically used.
1228 
1229   This function serves as an alternative to `PetscSortMPIInt()`. While this function works for any array of `PetscMPIInt` it is
1230   significantly faster if the array is not totally random. There are exceptions to this and so it is __highly__
1231   recommended that the user benchmark their code to see which routine is fastest.
1232 
1233 .seealso: `PetscTimSort()`, `PetscSortMPIInt()`
1234 @*/
1235 PetscErrorCode PetscMPIIntSortSemiOrdered(PetscInt n, PetscMPIInt arr[])
1236 {
1237   PetscFunctionBegin;
1238   if (n <= 1) PetscFunctionReturn(PETSC_SUCCESS);
1239   PetscAssertPointer(arr, 2);
1240   if (n < 64) {
1241     PetscCall(PetscSortMPIInt(n, arr));
1242   } else {
1243     PetscCall(PetscTimSort(n, arr, sizeof(PetscMPIInt), Compare_PetscMPIInt_Private, NULL));
1244   }
1245   PetscFunctionReturn(PETSC_SUCCESS);
1246 }
1247 
1248 /*@
1249   PetscMPIIntSortSemiOrderedWithArray - Sorts an array of `PetscMPIInt` in place in increasing order and reorders a second `PetscMPIInt`
1250   array to match the first.
1251 
1252   Not Collective
1253 
1254   Input Parameter:
1255 . n - number of values
1256 
1257   Input/Output Parameters:
1258 + arr1 - array of integers to be sorted, modified on output
1259 - arr2 - array of integers to be reordered, modified on output
1260 
1261   Level: intermediate
1262 
1263   Notes:
1264   The arrays CANNOT overlap.
1265 
1266   This function serves as an alternative to `PetscSortMPIIntWithArray()`. While this function works for any array of integers it is
1267   significantly faster if the array is not totally random. There are exceptions to this and so it is __highly__
1268   recommended that the user benchmark their code to see which routine is fastest.
1269 
1270 .seealso: `PetscTimSortWithArray()`, `PetscSortMPIIntWithArray()`, `PetscSortMPIIntWithPermutation()`
1271 @*/
1272 PetscErrorCode PetscMPIIntSortSemiOrderedWithArray(PetscInt n, PetscMPIInt arr1[], PetscMPIInt arr2[])
1273 {
1274   PetscFunctionBegin;
1275   if (n <= 1) PetscFunctionReturn(PETSC_SUCCESS);
1276   PetscAssertPointer(arr1, 2);
1277   PetscAssertPointer(arr2, 3);
1278   /* cannot export out to PetscMPIIntSortWithArray here since it isn't stable */
1279   PetscCall(PetscTimSortWithArray(n, arr1, sizeof(PetscMPIInt), arr2, sizeof(PetscMPIInt), Compare_PetscMPIInt_Private, NULL));
1280   PetscFunctionReturn(PETSC_SUCCESS);
1281 }
1282 
1283 /*@
1284   PetscRealSortSemiOrdered - Sorts an array of `PetscReal` in place in increasing order.
1285 
1286   Not Collective
1287 
1288   Input Parameters:
1289 + n   - number of values
1290 - arr - array of `PetscReal`
1291 
1292   Output Parameter:
1293 . arr - sorted array of integers
1294 
1295   Level: intermediate
1296 
1297   Notes:
1298   If the array is less than 64 entries long `PetscSortReal()` is automatically used.
1299 
1300   This function serves as an alternative to `PetscSortReal()`. While this function works for any array of `PetscReal` it is
1301   significantly faster if the array is not totally random. There are exceptions to this and so it is __highly__
1302   recommended that the user benchmark their code to see which routine is fastest.
1303 
1304 .seealso: `PetscTimSort()`, `PetscSortReal()`, `PetscSortRealWithPermutation()`
1305 @*/
1306 PetscErrorCode PetscRealSortSemiOrdered(PetscInt n, PetscReal arr[])
1307 {
1308   PetscFunctionBegin;
1309   if (n <= 1) PetscFunctionReturn(PETSC_SUCCESS);
1310   PetscAssertPointer(arr, 2);
1311   if (n < 64) {
1312     PetscCall(PetscSortReal(n, arr));
1313   } else {
1314     PetscCall(PetscTimSort(n, arr, sizeof(PetscReal), Compare_PetscReal_Private, NULL));
1315   }
1316   PetscFunctionReturn(PETSC_SUCCESS);
1317 }
1318 
1319 /*@
1320   PetscRealSortSemiOrderedWithArrayInt - Sorts an array of `PetscReal` in place in increasing order and reorders a second
1321   array of `PetscInt` to match the first.
1322 
1323   Not Collective
1324 
1325   Input Parameter:
1326 . n - number of values
1327 
1328   Input/Output Parameters:
1329 + arr1 - array of `PetscReal` to be sorted, modified on output
1330 - arr2 - array of `PetscInt` to be reordered, modified on output
1331 
1332   Level: intermediate
1333 
1334   Notes:
1335   This function serves as an alternative to `PetscSortRealWithArray()`. While this function works for any array of `PetscReal` it is
1336   significantly faster if the array is not totally random. There are exceptions to this and so it is __highly__
1337   recommended that the user benchmark their code to see which routine is fastest.
1338 
1339 .seealso: `PetscTimSortWithArray()`, `PetscSortRealWithArrayInt()`, `PetscSortRealWithPermutation()`
1340 @*/
1341 PetscErrorCode PetscRealSortSemiOrderedWithArrayInt(PetscInt n, PetscReal arr1[], PetscInt arr2[])
1342 {
1343   PetscFunctionBegin;
1344   if (n <= 1) PetscFunctionReturn(PETSC_SUCCESS);
1345   PetscAssertPointer(arr1, 2);
1346   PetscAssertPointer(arr2, 3);
1347   /* cannot export out to PetscRealSortWithArrayInt here since it isn't stable */
1348   PetscCall(PetscTimSortWithArray(n, arr1, sizeof(PetscReal), arr2, sizeof(PetscInt), Compare_PetscReal_Private, NULL));
1349   PetscFunctionReturn(PETSC_SUCCESS);
1350 }
1351