xref: /petsc/src/sys/utils/sortso.c (revision 3307d110e72ee4e6d2468971620073eb5ff93529)
1 #include <petscsys.h>                /*I  "petscsys.h"  I*/
2 #include <petsc/private/petscimpl.h>
3 
4 static inline int Compare_PetscMPIInt_Private(const void *left, const void *right, PETSC_UNUSED void *ctx)
5 {
6   PetscMPIInt l = *(PetscMPIInt *) left, r = *(PetscMPIInt *) right;
7   return (l < r) ? -1 : (l > r);
8 }
9 
10 static inline int Compare_PetscInt_Private(const void *left, const void *right, PETSC_UNUSED void *ctx)
11 {
12   PetscInt l = *(PetscInt *) left, r = *(PetscInt *) right;
13   return (l < r) ? -1 : (l > r);
14 }
15 
16 static inline int Compare_PetscReal_Private(const void *left, const void *right, PETSC_UNUSED void *ctx)
17 {
18   PetscReal l = *(PetscReal *) left, r = *(PetscReal *) right;
19   return (l < r) ? -1 : (l > r);
20 }
21 
22 #define MIN_GALLOP_CONST_GLOBAL 8
23 static PetscInt MIN_GALLOP_GLOBAL = MIN_GALLOP_CONST_GLOBAL;
24 typedef int (*CompFunc)(const void *, const void *, void *);
25 
26 /* Mostly to force clang uses the builtins, but can't hurt to have gcc compilers also do the same */
27 #if !defined (PETSC_USE_DEBUG) && defined(__GNUC__)
28 static inline void COPYSWAPPY(char *a, char *b, char *t, size_t size)
29 {
30   __builtin_memcpy(t, b, size);
31   __builtin_memmove(b, a, size);
32   __builtin_memcpy(a, t, size);
33   return;
34 }
35 
36 static inline void COPYSWAPPY2(char *al, char *ar, size_t asize, char *bl, char *br, size_t bsize, char *t)
37 {
38   __builtin_memcpy(t, ar, asize);
39   __builtin_memmove(ar, al, asize);
40   __builtin_memcpy(al, t, asize);
41   __builtin_memcpy(t, br, bsize);
42   __builtin_memmove(br, bl, bsize);
43   __builtin_memcpy(bl, t, bsize);
44   return;
45 }
46 
47 static inline void Petsc_memcpy(char *dest, const char *src, size_t size)
48 {
49   __builtin_memcpy(dest, src, size);
50   return;
51 }
52 
53 static inline void Petsc_memcpy2(char *adest, const char *asrc, size_t asize, char *bdest, const char *bsrc, size_t bsize)
54 {
55   __builtin_memcpy(adest, asrc, asize);
56   __builtin_memcpy(bdest, bsrc, bsize);
57   return;
58 }
59 
60 static inline void Petsc_memmove(char *dest, const char *src, size_t size)
61 {
62   __builtin_memmove(dest, src, size);
63   return;
64 }
65 
66 static inline void Petsc_memmove2(char *adest, const char *asrc, size_t asize, char *bdest, const char *bsrc, size_t bsize)
67 {
68   __builtin_memmove(adest, asrc, asize);
69   __builtin_memmove(bdest, bsrc, bsize);
70   return;
71 }
72 # else
73 static inline void COPYSWAPPY(char *a, char *b, char *t, size_t size)
74 {
75   PetscFunctionBegin;
76   PetscCallAbort(PETSC_COMM_SELF,PetscMemcpy(t, b, size));
77   PetscCallAbort(PETSC_COMM_SELF,PetscMemmove(b, a, size));
78   PetscCallAbort(PETSC_COMM_SELF,PetscMemcpy(a, t, size));
79   PetscFunctionReturnVoid();
80 }
81 
82 static inline void COPYSWAPPY2(char *al, char *ar, size_t asize, char *bl, char *br, size_t bsize, char *t)
83 {
84   PetscFunctionBegin;
85   PetscCallAbort(PETSC_COMM_SELF,PetscMemcpy(t, ar, asize));
86   PetscCallAbort(PETSC_COMM_SELF,PetscMemmove(ar, al, asize));
87   PetscCallAbort(PETSC_COMM_SELF,PetscMemcpy(al, t, asize));
88   PetscCallAbort(PETSC_COMM_SELF,PetscMemcpy(t, br, bsize));
89   PetscCallAbort(PETSC_COMM_SELF,PetscMemmove(br, bl, bsize));
90   PetscCallAbort(PETSC_COMM_SELF,PetscMemcpy(bl, t, bsize));
91   PetscFunctionReturnVoid();
92 }
93 
94 static inline void Petsc_memcpy(char *dest, const char *src, size_t size)
95 {
96   PetscFunctionBegin;
97   PetscCallAbort(PETSC_COMM_SELF,PetscMemcpy(dest, src, size));
98   PetscFunctionReturnVoid();
99 }
100 
101 static inline void Petsc_memcpy2(char *adest, const char *asrc, size_t asize, char *bdest, const char *bsrc, size_t bsize)
102 {
103   PetscFunctionBegin;
104   PetscCallAbort(PETSC_COMM_SELF,PetscMemcpy(adest, asrc, asize));
105   PetscCallAbort(PETSC_COMM_SELF,PetscMemcpy(bdest, bsrc, bsize));
106   PetscFunctionReturnVoid();
107 }
108 
109 static inline void Petsc_memmove(char *dest, const char *src, size_t size)
110 {
111   PetscFunctionBegin;
112   PetscCallAbort(PETSC_COMM_SELF,PetscMemmove(dest, src, size));
113   PetscFunctionReturnVoid();
114 }
115 
116 static inline void Petsc_memmove2(char *adest, const char *asrc, size_t asize, char *bdest, const char *bsrc, size_t bsize)
117 {
118   PetscFunctionBegin;
119   PetscCallAbort(PETSC_COMM_SELF,PetscMemmove(adest, asrc, asize));
120   PetscCallAbort(PETSC_COMM_SELF,PetscMemmove(bdest, bsrc, bsize));
121   PetscFunctionReturnVoid();
122 }
123 #endif
124 
125 /* Start left look right. Looking for e.g. B[0] in A or mergelo. l inclusive, r inclusive. Returns first m such that arr[m] >
126  x. Output also inclusive.
127 
128  NOTE: Both gallopsearch functions CANNOT distinguish between inserting AFTER the entire array vs inserting at entry n!! For example for an array:
129 
130    {0,1,2,3,4,5}
131 
132    when looking to insert "5" this routine will return *m = 6, but when looking to insert "6" it will ALSO return *m = 6.
133   */
134 static inline PetscErrorCode PetscGallopSearchLeft_Private(const char *arr, size_t size, CompFunc cmp, void *ctx, PetscInt l, PetscInt r, const char *x, PetscInt *m)
135 {
136   PetscInt last = l, k = 1, mid, cur = l+1;
137 
138   PetscFunctionBegin;
139   *m = l;
140   PetscAssert(r >= l,PETSC_COMM_SELF,PETSC_ERR_PLIB,"r %" PetscInt_FMT " < l %" PetscInt_FMT " in PetscGallopSearchLeft",r,l);
141   if ((*cmp)(x, arr+r*size, ctx) >= 0) {*m = r; PetscFunctionReturn(0);}
142   if ((*cmp)(x, (arr)+l*size, ctx) < 0 || PetscUnlikely(!(r-l))) PetscFunctionReturn(0);
143   while (PETSC_TRUE) {
144     if (PetscUnlikely(cur > r)) {cur = r; break;}
145     if ((*cmp)(x, (arr)+cur*size, ctx) < 0) break;
146     last = cur;
147     cur += (k <<= 1) + 1;
148     ++k;
149   }
150   /* standard binary search but take last 0 mid 0 cur 1 into account*/
151   while (cur > last + 1) {
152     mid = last + ((cur - last) >> 1);
153     if ((*cmp)(x, (arr)+mid*size, ctx) < 0) {
154       cur = mid;
155     } else {
156       last = mid;
157     }
158   }
159   *m = cur;
160   PetscFunctionReturn(0);
161 }
162 
163 /* Start right look left. Looking for e.g. A[-1] in B or mergehi. l inclusive, r inclusive. Returns last m such that arr[m]
164  < x. Output also inclusive */
165 static inline PetscErrorCode PetscGallopSearchRight_Private(const char *arr, size_t size, CompFunc cmp, void *ctx, PetscInt l, PetscInt r, const char *x, PetscInt *m)
166 {
167   PetscInt last = r, k = 1, mid, cur = r-1;
168 
169   PetscFunctionBegin;
170   *m = r;
171   PetscAssert(r >= l,PETSC_COMM_SELF,PETSC_ERR_PLIB,"r %" PetscInt_FMT " < l %" PetscInt_FMT " in PetscGallopSearchRight",r,l);
172   if ((*cmp)(x, arr+l*size, ctx) <= 0) {*m = l; PetscFunctionReturn(0);}
173   if ((*cmp)(x, (arr)+r*size, ctx) > 0 || PetscUnlikely(!(r-l))) PetscFunctionReturn(0);
174   while (PETSC_TRUE) {
175     if (PetscUnlikely(cur < l)) {cur = l; break;}
176     if ((*cmp)(x, (arr)+cur*size, ctx) > 0) break;
177     last = cur;
178     cur -= (k <<= 1) + 1;
179     ++k;
180   }
181   /* standard binary search but take last r-1 mid r-1 cur r-2 into account*/
182   while (last > cur + 1) {
183     mid = last - ((last - cur) >> 1);
184     if ((*cmp)(x, (arr)+mid*size, ctx) > 0) {
185       cur = mid;
186     } else {
187       last = mid;
188     }
189   }
190   *m = cur;
191   PetscFunctionReturn(0);
192 }
193 
194 /* Mergesort where size of left half <= size of right half, so mergesort is done left to right. Arr should be pointer to
195  complete array, left is first index of left array, mid is first index of right array, right is last index of right
196  array */
197 static inline PetscErrorCode PetscTimSortMergeLo_Private(char *arr, char *tarr, size_t size, CompFunc cmp, void *ctx, PetscInt left, PetscInt mid, PetscInt right)
198 {
199   PetscInt       i = 0, j = mid, k = left, gallopleft = 0, gallopright = 0;
200   const PetscInt llen = mid-left;
201 
202   PetscFunctionBegin;
203   Petsc_memcpy(tarr, arr+(left*size), llen*size);
204   while ((i < llen) && (j <= right)) {
205     if ((*cmp)(tarr+(i*size), arr+(j*size), ctx) < 0) {
206       Petsc_memcpy(arr+(k*size), tarr+(i*size), size);
207       ++k;
208       gallopright = 0;
209       if (++i < llen && ++gallopleft >= MIN_GALLOP_GLOBAL) {
210         PetscInt l1, l2, diff1, diff2;
211         do {
212           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
213           /* search temp for right[j], can move up to that of temp into arr immediately */
214           PetscCall(PetscGallopSearchLeft_Private(tarr, size, cmp, ctx, i, llen-1, arr+(j*size), &l1));
215           diff1 = l1-i;
216           Petsc_memcpy(arr+(k*size), tarr+(i*size), diff1*size);
217           k += diff1;
218           i = l1;
219           /* search right for temp[i], can move up to that many of right into arr */
220           PetscCall(PetscGallopSearchLeft_Private(arr, size, cmp, ctx, j, right, tarr+(i*size), &l2));
221           diff2 = l2-j;
222           Petsc_memmove((arr)+k*size, (arr)+j*size, diff2*size);
223           k += diff2;
224           j = l2;
225           if (i >= llen || j > right) break;
226         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
227         ++MIN_GALLOP_GLOBAL;
228       }
229     } else {
230       Petsc_memmove(arr+(k*size), arr+(j*size), size);
231       ++k;
232       gallopleft = 0;
233       if (++j <= right && ++gallopright >= MIN_GALLOP_GLOBAL) {
234         PetscInt l1, l2, diff1, diff2;
235         do {
236           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
237           /* search right for temp[i], can move up to that many of right into arr */
238           PetscCall(PetscGallopSearchLeft_Private(arr, size, cmp, ctx, j, right, tarr+(i*size), &l2));
239           diff2 = l2-j;
240           Petsc_memmove(arr+(k*size), arr+(j*size), diff2*size);
241           k += diff2;
242           j = l2;
243           /* search temp for right[j], can copy up to that of temp into arr immediately */
244           PetscCall(PetscGallopSearchLeft_Private(tarr, size, cmp, ctx, i, llen-1, arr+(j*size), &l1));
245           diff1 = l1-i;
246           Petsc_memcpy(arr+(k*size), tarr+(i*size), diff1*size);
247           k += diff1;
248           i = l1;
249           if (i >= llen || j > right) break;
250         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
251         ++MIN_GALLOP_GLOBAL;
252       }
253     }
254   }
255   if (i<llen) {Petsc_memcpy(arr+(k*size), tarr+(i*size), (llen-i)*size);}
256   PetscFunctionReturn(0);
257 }
258 
259 static inline PetscErrorCode PetscTimSortMergeLoWithArray_Private(char *arr, char *atarr, size_t asize, char *barr, char *btarr, size_t bsize, CompFunc cmp, void *ctx, PetscInt left, PetscInt mid, PetscInt right)
260 {
261   PetscInt       i = 0, j = mid, k = left, gallopleft = 0, gallopright = 0;
262   const PetscInt llen = mid-left;
263 
264   PetscFunctionBegin;
265   Petsc_memcpy2(atarr, arr+(left*asize), llen*asize, btarr, barr+(left*bsize), llen*bsize);
266   while ((i < llen) && (j <= right)) {
267     if ((*cmp)(atarr+(i*asize), arr+(j*asize), ctx) < 0) {
268       Petsc_memcpy2(arr+(k*asize), atarr+(i*asize), asize, barr+(k*bsize), btarr+(i*bsize), bsize);
269       ++k;
270       gallopright = 0;
271       if (++i < llen && ++gallopleft >= MIN_GALLOP_GLOBAL) {
272         PetscInt l1, l2, diff1, diff2;
273         do {
274           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
275           /* search temp for right[j], can move up to that of temp into arr immediately */
276           PetscCall(PetscGallopSearchLeft_Private(atarr, asize, cmp, ctx, i, llen-1, arr+(j*asize), &l1));
277           diff1 = l1-i;
278           Petsc_memcpy2(arr+(k*asize), atarr+(i*asize), diff1*asize, barr+(k*bsize), btarr+(i*bsize), diff1*bsize);
279           k += diff1;
280           i = l1;
281           /* search right for temp[i], can move up to that many of right into arr */
282           PetscCall(PetscGallopSearchLeft_Private(arr, asize, cmp, ctx, j, right, atarr+(i*asize), &l2));
283           diff2 = l2-j;
284           Petsc_memmove2(arr+(k*asize), arr+(j*asize), diff2*asize, barr+(k*bsize), barr+(j*bsize), diff2*bsize);
285           k += diff2;
286           j = l2;
287           if (i >= llen || j > right) break;
288         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
289         ++MIN_GALLOP_GLOBAL;
290       }
291     } else {
292       Petsc_memmove2(arr+(k*asize), arr+(j*asize), asize, barr+(k*bsize), barr+(j*bsize), bsize);
293       ++k;
294       gallopleft = 0;
295       if (++j <= right && ++gallopright >= MIN_GALLOP_GLOBAL) {
296         PetscInt l1, l2, diff1, diff2;
297         do {
298           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
299           /* search right for temp[i], can move up to that many of right into arr */
300           PetscCall(PetscGallopSearchLeft_Private(arr, asize, cmp, ctx, j, right, atarr+(i*asize), &l2));
301           diff2 = l2-j;
302           Petsc_memmove2(arr+(k*asize), arr+(j*asize), diff2*asize, barr+(k*bsize), barr+(j*bsize), diff2*bsize);
303           k += diff2;
304           j = l2;
305           /* search temp for right[j], can copy up to that of temp into arr immediately */
306           PetscCall(PetscGallopSearchLeft_Private(atarr, asize, cmp, ctx, i, llen-1, arr+(j*asize), &l1));
307           diff1 = l1-i;
308           Petsc_memcpy2(arr+(k*asize), atarr+(i*asize), diff1*asize, barr+(k*bsize), btarr+(i*bsize), diff1*bsize);
309           k += diff1;
310           i = l1;
311           if (i >= llen || j > right) break;
312         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
313         ++MIN_GALLOP_GLOBAL;
314       }
315     }
316   }
317   if (i<llen) {Petsc_memcpy2(arr+(k*asize), atarr+(i*asize), (llen-i)*asize, barr+(k*bsize), btarr+(i*bsize), (llen-i)*bsize);}
318   PetscFunctionReturn(0);
319 }
320 
321 /* Mergesort where size of right half < size of left half, so mergesort is done right to left. Arr should be pointer to
322  complete array, left is first index of left array, mid is first index of right array, right is last index of right
323  array */
324 static inline PetscErrorCode PetscTimSortMergeHi_Private(char *arr, char *tarr, size_t size, CompFunc cmp, void *ctx, PetscInt left, PetscInt mid, PetscInt right)
325 {
326   PetscInt       i = right-mid, j = mid-1, k = right, gallopleft = 0, gallopright = 0;
327   const PetscInt rlen = right-mid+1;
328 
329   PetscFunctionBegin;
330   Petsc_memcpy(tarr, (arr)+mid*size, rlen*size);
331   while ((i >= 0) && (j >= left)) {
332     if ((*cmp)((tarr)+i*size, (arr)+j*size, ctx) > 0) {
333       Petsc_memcpy((arr)+k*size, (tarr)+i*size, size);
334       --k;
335       gallopleft = 0;
336       if (--i >= 0 && ++gallopright >= MIN_GALLOP_GLOBAL) {
337         PetscInt l1, l2, diff1, diff2;
338         do {
339           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
340           /* search temp for left[j], can copy up to that many of temp into arr */
341           PetscCall(PetscGallopSearchRight_Private(tarr, size, cmp, ctx, 0, i, (arr)+j*size, &l1));
342           diff1 = i-l1;
343           Petsc_memcpy((arr)+(k-diff1+1)*size, (tarr)+(l1+1)*size, diff1*size);
344           k -= diff1;
345           i = l1;
346           /* search left for temp[i], can move up to that many of left up arr */
347           PetscCall(PetscGallopSearchRight_Private(arr, size, cmp, ctx, left, j, (tarr)+i*size, &l2));
348           diff2 = j-l2;
349           Petsc_memmove((arr)+(k-diff2+1)*size, (arr)+(l2+1)*size, diff2*size);
350           k -= diff2;
351           j = l2;
352           if (i < 0 || j < left) break;
353         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
354         ++MIN_GALLOP_GLOBAL;
355       }
356     } else {
357       Petsc_memmove((arr)+k*size, (arr)+j*size, size);
358       --k;
359       gallopright = 0;
360       if (--j >= left && ++gallopleft >= MIN_GALLOP_GLOBAL) {
361         PetscInt l1, l2, diff1, diff2;
362         do {
363           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
364           /* search left for temp[i], can move up to that many of left up arr */
365           PetscCall(PetscGallopSearchRight_Private(arr, size, cmp, ctx, left, j, (tarr)+i*size, &l2));
366           diff2 = j-l2;
367           Petsc_memmove((arr)+(k-diff2+1)*size, (arr)+(l2+1)*size, diff2*size);
368           k -= diff2;
369           j = l2;
370           /* search temp for left[j], can copy up to that many of temp into arr */
371           PetscCall(PetscGallopSearchRight_Private(tarr, size, cmp, ctx, 0, i, (arr)+j*size, &l1));
372           diff1 = i-l1;
373           Petsc_memcpy((arr)+(k-diff1+1)*size, (tarr)+(l1+1)*size, diff1*size);
374           k -= diff1;
375           i = l1;
376           if (i < 0 || j < left) break;
377         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
378         ++MIN_GALLOP_GLOBAL;
379       }
380     }
381   }
382   if (i >= 0) {Petsc_memcpy((arr)+left*size, tarr, (i+1)*size);}
383   PetscFunctionReturn(0);
384 }
385 
386 static inline PetscErrorCode PetscTimSortMergeHiWithArray_Private(char *arr, char *atarr, size_t asize, char *barr, char *btarr, size_t bsize, CompFunc cmp, void *ctx, PetscInt left, PetscInt mid, PetscInt right)
387 {
388   PetscInt       i = right-mid, j = mid-1, k = right, gallopleft = 0, gallopright = 0;
389   const PetscInt rlen = right-mid+1;
390 
391   PetscFunctionBegin;
392   Petsc_memcpy2(atarr, (arr)+mid*asize, rlen*asize, btarr, (barr)+mid*bsize, rlen*bsize);
393   while ((i >= 0) && (j >= left)) {
394     if ((*cmp)((atarr)+i*asize, (arr)+j*asize, ctx) > 0) {
395       Petsc_memcpy2((arr)+k*asize, (atarr)+i*asize, asize, (barr)+k*bsize, (btarr)+i*bsize, bsize);
396       --k;
397       gallopleft = 0;
398       if (--i >= 0 && ++gallopright >= MIN_GALLOP_GLOBAL) {
399         PetscInt l1, l2, diff1, diff2;
400         do {
401           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
402           /* search temp for left[j], can copy up to that many of temp into arr */
403           PetscCall(PetscGallopSearchRight_Private(atarr, asize, cmp, ctx, 0, i, (arr)+j*asize, &l1));
404           diff1 = i-l1;
405           Petsc_memcpy2((arr)+(k-diff1+1)*asize, (atarr)+(l1+1)*asize, diff1*asize, (barr)+(k-diff1+1)*bsize, (btarr)+(l1+1)*bsize, diff1*bsize);
406           k -= diff1;
407           i = l1;
408           /* search left for temp[i], can move up to that many of left up arr */
409           PetscCall(PetscGallopSearchRight_Private(arr, asize, cmp, ctx, left, j, (atarr)+i*asize, &l2));
410           diff2 = j-l2;
411           Petsc_memmove2((arr)+(k-diff2+1)*asize, (arr)+(l2+1)*asize, diff2*asize, (barr)+(k-diff2+1)*bsize, (barr)+(l2+1)*bsize, diff2*bsize);
412           k -= diff2;
413           j = l2;
414           if (i < 0 || j < left) break;
415         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
416         ++MIN_GALLOP_GLOBAL;
417       }
418     } else {
419       Petsc_memmove2((arr)+k*asize, (arr)+j*asize, asize, (barr)+k*bsize, (barr)+j*bsize, bsize);
420       --k;
421       gallopright = 0;
422       if (--j >= left && ++gallopleft >= MIN_GALLOP_GLOBAL) {
423         PetscInt l1, l2, diff1, diff2;
424         do {
425           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
426           /* search left for temp[i], can move up to that many of left up arr */
427           PetscCall(PetscGallopSearchRight_Private(arr, asize, cmp, ctx, left, j, (atarr)+i*asize, &l2));
428           diff2 = j-l2;
429           Petsc_memmove2((arr)+(k-diff2+1)*asize, (arr)+(l2+1)*asize, diff2*asize, (barr)+(k-diff2+1)*bsize, (barr)+(l2+1)*bsize, diff2*bsize);
430           k -= diff2;
431           j = l2;
432           /* search temp for left[j], can copy up to that many of temp into arr */
433           PetscCall(PetscGallopSearchRight_Private(atarr, asize, cmp, ctx, 0, i, (arr)+j*asize, &l1));
434           diff1 = i-l1;
435           Petsc_memcpy2((arr)+(k-diff1+1)*asize, (atarr)+(l1+1)*asize, diff1*asize, (barr)+(k-diff1+1)*bsize, (btarr)+(l1+1)*bsize, diff1*bsize);
436           k -= diff1;
437           i = l1;
438           if (i < 0 || j < left) break;
439         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
440         ++MIN_GALLOP_GLOBAL;
441       }
442     }
443   }
444   if (i >= 0) {Petsc_memcpy2((arr)+left*asize, atarr, (i+1)*asize, (barr)+left*bsize, btarr, (i+1)*bsize);}
445   PetscFunctionReturn(0);
446 }
447 
448 /* Left is inclusive lower bound of array slice, start is start location of unsorted section, right is inclusive upper
449  bound of array slice. If unsure of where unsorted section starts or if entire length is unsorted pass start = left */
450 static inline PetscErrorCode PetscInsertionSort_Private(char *arr, char *tarr, size_t size, CompFunc cmp, void *ctx, PetscInt left, PetscInt start, PetscInt right)
451 {
452   PetscInt i = start == left ? start+1 : start;
453 
454   PetscFunctionBegin;
455   for (; i <= right; ++i) {
456     PetscInt j = i-1;
457     Petsc_memcpy(tarr, arr+(i*size), size);
458     while ((j >= left) && ((*cmp)(tarr, (arr)+j*size, ctx) < 0)) {
459       COPYSWAPPY(arr+(j+1)*size, arr+j*size, tarr+size, size);
460       --j;
461     }
462     Petsc_memcpy((arr)+(j+1)*size, tarr, size);
463   }
464   PetscFunctionReturn(0);
465 }
466 
467 static inline PetscErrorCode PetscInsertionSortWithArray_Private(char *arr, char *atarr, size_t asize, char *barr, char *btarr, size_t bsize, CompFunc cmp, void *ctx, PetscInt left, PetscInt start, PetscInt right)
468 {
469   PetscInt i = start == left ? start+1 : start;
470 
471   PetscFunctionBegin;
472   for (; i <= right; ++i) {
473     PetscInt j = i-1;
474     Petsc_memcpy2(atarr, arr+(i*asize), asize, btarr, barr+(i*bsize), bsize);
475     while ((j >= left) && ((*cmp)(atarr, arr+(j*asize), ctx) < 0)) {
476       COPYSWAPPY2(arr+(j+1)*asize, arr+j*asize, asize, barr+(j+1)*bsize, barr+j*bsize, bsize, atarr+asize);
477       --j;
478     }
479     Petsc_memcpy2(arr+(j+1)*asize, atarr, asize, barr+(j+1)*bsize, btarr, bsize);
480   }
481   PetscFunctionReturn(0);
482 }
483 
484 /* See PetscInsertionSort_Private */
485 static inline PetscErrorCode PetscBinaryInsertionSort_Private(char *arr, char *tarr, size_t size, CompFunc cmp, void *ctx, PetscInt left, PetscInt start, PetscInt right)
486 {
487   PetscInt i = start == left ? start+1 : start;
488 
489   PetscFunctionBegin;
490   for (; i <= right; ++i) {
491     PetscInt l = left, r = i;
492     Petsc_memcpy(tarr, arr+(i*size), size);
493     do {
494       const PetscInt m = l + ((r - l) >> 1);
495       if ((*cmp)(tarr, arr+(m*size), ctx) < 0) {
496         r = m;
497       } else {
498         l = m + 1;
499       }
500     } while (l < r);
501     Petsc_memmove(arr+((l+1)*size), arr+(l*size), (i-l)*size);
502     Petsc_memcpy(arr+(l*size), tarr, size);
503   }
504   PetscFunctionReturn(0);
505 }
506 
507 static inline PetscErrorCode PetscBinaryInsertionSortWithArray_Private(char *arr, char *atarr, size_t asize, char *barr, char *btarr, size_t bsize, CompFunc cmp, void *ctx, PetscInt left, PetscInt start, PetscInt right)
508 {
509   PetscInt i = start == left ? start+1 : start;
510 
511   PetscFunctionBegin;
512   for (; i <= right; ++i) {
513     PetscInt l = left, r = i;
514     Petsc_memcpy2(atarr, arr+(i*asize), asize, btarr, barr+(i*bsize), bsize);
515     do {
516       const PetscInt m = l + ((r - l) >> 1);
517       if ((*cmp)(atarr, arr+(m*asize), ctx) < 0) {
518         r = m;
519       } else {
520         l = m + 1;
521       }
522     } while (l < r);
523     Petsc_memmove2(arr+((l+1)*asize), arr+(l*asize), (i-l)*asize, barr+((l+1)*bsize), barr+(l*bsize), (i-l)*bsize);
524     Petsc_memcpy2(arr+(l*asize), atarr, asize, barr+(l*bsize), btarr, bsize);
525   }
526   PetscFunctionReturn(0);
527 }
528 
529 typedef struct {
530   PetscInt size;
531   PetscInt start;
532 #if defined(__CRAY_AARCH64) /* segfaults with Cray compilers for aarch64 on a64FX */
533 } PetscTimSortStack;
534 #else
535 } PetscTimSortStack PETSC_ATTRIBUTEALIGNED(2*sizeof(PetscInt));
536 #endif
537 
538 typedef struct {
539   char   *ptr PETSC_ATTRIBUTEALIGNED(PETSC_MEMALIGN);
540   size_t size;
541   size_t maxsize;
542 } PetscTimSortBuffer;
543 
544 static inline PetscErrorCode PetscTimSortResizeBuffer_Private(PetscTimSortBuffer *buff, size_t newSize)
545 {
546   PetscFunctionBegin;
547   if (PetscLikely(newSize <= buff->size)) PetscFunctionReturn(0);
548   {
549     /* Can't be larger than n, there is merit to simply allocating buff to n to begin with */
550     size_t         newMax = PetscMin(newSize*newSize, buff->maxsize);
551     PetscCall(PetscFree(buff->ptr));
552     PetscCall(PetscMalloc1(newMax, &buff->ptr));
553     buff->size = newMax;
554   }
555   PetscFunctionReturn(0);
556 }
557 
558 static inline PetscErrorCode PetscTimSortForceCollapse_Private(char *arr, size_t size, CompFunc cmp, void *ctx, PetscTimSortBuffer *buff, PetscTimSortStack *stack, PetscInt stacksize)
559 {
560   PetscFunctionBegin;
561   for (;stacksize; --stacksize) {
562     /* A = stack[i-1], B = stack[i], if A[-1] <= B[0] means sorted */
563     if ((*cmp)(arr+(stack[stacksize].start-1)*size, arr+(stack[stacksize].start)*size, ctx) > 0) {
564       PetscInt       l, m = stack[stacksize].start, r;
565       /* Search A for B[0] insertion */
566       PetscCall(PetscGallopSearchLeft_Private(arr, size, cmp, ctx, stack[stacksize-1].start, stack[stacksize].start-1, (arr)+(stack[stacksize].start)*size, &l));
567       /* Search B for A[-1] insertion */
568       PetscCall(PetscGallopSearchRight_Private(arr, size, cmp, ctx, stack[stacksize].start, stack[stacksize].start+stack[stacksize].size-1, (arr)+(stack[stacksize].start-1)*size, &r));
569       if (m-l <= r-m) {
570         PetscCall(PetscTimSortResizeBuffer_Private(buff, (m-l+1)*size));
571         PetscCall(PetscTimSortMergeLo_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
572       } else {
573         PetscCall(PetscTimSortResizeBuffer_Private(buff, (r-m+1)*size));
574         PetscCall(PetscTimSortMergeHi_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
575       }
576     }
577     /* Update A with merge */
578     stack[stacksize-1].size += stack[stacksize].size;
579   }
580   PetscFunctionReturn(0);
581 }
582 
583 static inline PetscErrorCode PetscTimSortForceCollapseWithArray_Private(char *arr, size_t asize, char *barr, size_t bsize, CompFunc cmp, void *ctx, PetscTimSortBuffer *abuff, PetscTimSortBuffer *bbuff, PetscTimSortStack *stack, PetscInt stacksize)
584 {
585   PetscFunctionBegin;
586   for (;stacksize; --stacksize) {
587     /* A = stack[i-1], B = stack[i], if A[-1] <= B[0] means sorted */
588     if ((*cmp)(arr+(stack[stacksize].start-1)*asize, arr+(stack[stacksize].start)*asize, ctx) > 0) {
589       PetscInt       l, m = stack[stacksize].start, r;
590       /* Search A for B[0] insertion */
591       PetscCall(PetscGallopSearchLeft_Private(arr, asize, cmp, ctx, stack[stacksize-1].start, stack[stacksize].start-1, (arr)+(stack[stacksize].start)*asize, &l));
592       /* Search B for A[-1] insertion */
593       PetscCall(PetscGallopSearchRight_Private(arr, asize, cmp, ctx, stack[stacksize].start, stack[stacksize].start+stack[stacksize].size-1, (arr)+(stack[stacksize].start-1)*asize, &r));
594       if (m-l <= r-m) {
595         PetscCall(PetscTimSortResizeBuffer_Private(abuff, (m-l+1)*asize));
596         PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (m-l+1)*bsize));
597         PetscCall(PetscTimSortMergeLoWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
598       } else {
599         PetscCall(PetscTimSortResizeBuffer_Private(abuff, (r-m+1)*asize));
600         PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (r-m+1)*bsize));
601         PetscCall(PetscTimSortMergeHiWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
602       }
603     }
604     /* Update A with merge */
605     stack[stacksize-1].size += stack[stacksize].size;
606   }
607   PetscFunctionReturn(0);
608 }
609 
610 static inline PetscErrorCode PetscTimSortMergeCollapse_Private(char *arr, size_t size, CompFunc cmp, void *ctx, PetscTimSortBuffer *buff, PetscTimSortStack *stack, PetscInt *stacksize)
611 {
612   PetscInt       i = *stacksize;
613 
614   PetscFunctionBegin;
615   while (i) {
616     PetscInt l, m, r, itemp = i;
617 
618     if (i == 1) {
619       /* A = stack[i-1], B = stack[i] */
620       if (stack[i-1].size < stack[i].size) {
621         /* if A[-1] <= B[0] then sorted */
622         if ((*cmp)(arr+(stack[i].start-1)*size, arr+(stack[i].start)*size, ctx) > 0) {
623           m = stack[i].start;
624           /* Search A for B[0] insertion */
625           PetscCall(PetscGallopSearchLeft_Private(arr, size, cmp, ctx, stack[i-1].start, stack[i].start-1, (arr)+stack[i].start*size, &l));
626           /* Search B for A[-1] insertion */
627           PetscCall(PetscGallopSearchRight_Private(arr, size, cmp, ctx, stack[i].start, stack[i].start+stack[i].size-1, arr+(stack[i].start-1)*size, &r));
628           if (m-l <= r-m) {
629             PetscCall(PetscTimSortResizeBuffer_Private(buff, (m-l+1)*size));
630             PetscCall(PetscTimSortMergeLo_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
631           } else {
632             PetscCall(PetscTimSortResizeBuffer_Private(buff, (r-m+1)*size));
633             PetscCall(PetscTimSortMergeHi_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
634           }
635         }
636         /* Update A with merge */
637         stack[i-1].size += stack[i].size;
638         --i;
639       }
640     } else {
641       /* i > 2, i.e. C exists
642        A = stack[i-2], B = stack[i-1], C = stack[i]; */
643       if (stack[i-2].size <= stack[i-1].size+stack[i].size) {
644         if (stack[i-2].size < stack[i].size) {
645           /* merge B into A, but if A[-1] <= B[0] then already sorted */
646           if ((*cmp)(arr+(stack[i-1].start-1)*size, arr+(stack[i-1].start)*size, ctx) > 0) {
647             m = stack[i-1].start;
648             /* Search A for B[0] insertion */
649             PetscCall(PetscGallopSearchLeft_Private(arr, size, cmp, ctx, stack[i-2].start, stack[i-1].start-1, (arr)+(stack[i-1].start)*size, &l));
650             /* Search B for A[-1] insertion */
651             PetscCall(PetscGallopSearchRight_Private(arr, size, cmp, ctx, stack[i-1].start, stack[i-1].start+stack[i-1].size-1, (arr)+(stack[i-1].start-1)*size, &r));
652             if (m-l <= r-m) {
653               PetscCall(PetscTimSortResizeBuffer_Private(buff, (m-l+1)*size));
654               PetscCall(PetscTimSortMergeLo_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
655             } else {
656               PetscCall(PetscTimSortResizeBuffer_Private(buff, (r-m+1)*size));
657               PetscCall(PetscTimSortMergeHi_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
658             }
659           }
660           /* Update A with merge */
661           stack[i-2].size += stack[i-1].size;
662           /* Push C up the stack */
663           stack[i-1].start = stack[i].start;
664           stack[i-1].size = stack[i].size;
665         } else {
666           /* merge C into B */
667           mergeBC:
668           /* If B[-1] <= C[0] then... you know the drill */
669           if ((*cmp)(arr+(stack[i].start-1)*size, arr+(stack[i].start)*size, ctx) > 0) {
670             m = stack[i].start;
671             /* Search B for C[0] insertion */
672             PetscCall(PetscGallopSearchLeft_Private(arr, size, cmp, ctx, stack[i-1].start, stack[i].start-1, arr+stack[i].start*size, &l));
673             /* Search C for B[-1] insertion */
674             PetscCall(PetscGallopSearchRight_Private(arr, size, cmp, ctx, stack[i].start, stack[i].start+stack[i].size-1, (arr)+(stack[i].start-1)*size, &r));
675             if (m-l <= r-m) {
676               PetscCall(PetscTimSortResizeBuffer_Private(buff, (m-l+1)*size));
677               PetscCall(PetscTimSortMergeLo_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
678             } else {
679               PetscCall(PetscTimSortResizeBuffer_Private(buff, (r-m+1)*size));
680               PetscCall(PetscTimSortMergeHi_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
681             }
682           }
683           /* Update B with merge */
684           stack[i-1].size += stack[i].size;
685         }
686         --i;
687       } else if (stack[i-1].size <= stack[i].size) {
688         /* merge C into B */
689         goto mergeBC;
690       }
691     }
692     if (itemp == i) break;
693   }
694   *stacksize = i;
695   PetscFunctionReturn(0);
696 }
697 
698 static inline PetscErrorCode PetscTimSortMergeCollapseWithArray_Private(char *arr, size_t asize, char *barr, size_t bsize, CompFunc cmp, void *ctx, PetscTimSortBuffer *abuff, PetscTimSortBuffer *bbuff, PetscTimSortStack *stack, PetscInt *stacksize)
699 {
700   PetscInt       i = *stacksize;
701 
702   PetscFunctionBegin;
703   while (i) {
704     PetscInt l, m, r, itemp = i;
705 
706     if (i == 1) {
707       /* A = stack[i-1], B = stack[i] */
708       if (stack[i-1].size < stack[i].size) {
709         /* if A[-1] <= B[0] then sorted */
710         if ((*cmp)(arr+(stack[i].start-1)*asize, arr+(stack[i].start)*asize, ctx) > 0) {
711           m = stack[i].start;
712           /* Search A for B[0] insertion */
713           PetscCall(PetscGallopSearchLeft_Private(arr, asize, cmp, ctx, stack[i-1].start, stack[i].start-1, (arr)+stack[i].start*asize, &l));
714           /* Search B for A[-1] insertion */
715           PetscCall(PetscGallopSearchRight_Private(arr, asize, cmp, ctx, stack[i].start, stack[i].start+stack[i].size-1, arr+(stack[i].start-1)*asize, &r));
716           if (m-l <= r-m) {
717             PetscCall(PetscTimSortResizeBuffer_Private(abuff, (m-l+1)*asize));
718             PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (m-l+1)*bsize));
719             PetscCall(PetscTimSortMergeLoWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
720           } else {
721             PetscCall(PetscTimSortResizeBuffer_Private(abuff, (r-m+1)*asize));
722             PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (r-m+1)*bsize));
723             PetscCall(PetscTimSortMergeHiWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
724           }
725         }
726         /* Update A with merge */
727         stack[i-1].size += stack[i].size;
728         --i;
729       }
730     } else {
731       /* i > 2, i.e. C exists
732        A = stack[i-2], B = stack[i-1], C = stack[i]; */
733       if (stack[i-2].size <= stack[i-1].size+stack[i].size) {
734         if (stack[i-2].size < stack[i].size) {
735           /* merge B into A, but if A[-1] <= B[0] then already sorted */
736           if ((*cmp)(arr+(stack[i-1].start-1)*asize, arr+(stack[i-1].start)*asize, ctx) > 0) {
737             m = stack[i-1].start;
738             /* Search A for B[0] insertion */
739             PetscCall(PetscGallopSearchLeft_Private(arr, asize, cmp, ctx, stack[i-2].start, stack[i-1].start-1, (arr)+(stack[i-1].start)*asize, &l));
740             /* Search B for A[-1] insertion */
741             PetscCall(PetscGallopSearchRight_Private(arr, asize, cmp, ctx, stack[i-1].start, stack[i-1].start+stack[i-1].size-1, (arr)+(stack[i-1].start-1)*asize, &r));
742             if (m-l <= r-m) {
743               PetscCall(PetscTimSortResizeBuffer_Private(abuff, (m-l+1)*asize));
744               PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (m-l+1)*bsize));
745               PetscCall(PetscTimSortMergeLoWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
746             } else {
747               PetscCall(PetscTimSortResizeBuffer_Private(abuff, (r-m+1)*asize));
748               PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (r-m+1)*bsize));
749               PetscCall(PetscTimSortMergeHiWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
750             }
751           }
752           /* Update A with merge */
753           stack[i-2].size += stack[i-1].size;
754           /* Push C up the stack */
755           stack[i-1].start = stack[i].start;
756           stack[i-1].size = stack[i].size;
757         } else {
758           /* merge C into B */
759           mergeBC:
760           /* If B[-1] <= C[0] then... you know the drill */
761           if ((*cmp)(arr+(stack[i].start-1)*asize, arr+(stack[i].start)*asize, ctx) > 0) {
762             m = stack[i].start;
763             /* Search B for C[0] insertion */
764             PetscCall(PetscGallopSearchLeft_Private(arr, asize, cmp, ctx, stack[i-1].start, stack[i].start-1, arr+stack[i].start*asize, &l));
765             /* Search C for B[-1] insertion */
766             PetscCall(PetscGallopSearchRight_Private(arr, asize, cmp, ctx, stack[i].start, stack[i].start+stack[i].size-1, (arr)+(stack[i].start-1)*asize, &r));
767             if (m-l <= r-m) {
768               PetscCall(PetscTimSortResizeBuffer_Private(abuff, (m-l+1)*asize));
769               PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (m-l+1)*bsize));
770               PetscCall(PetscTimSortMergeLoWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
771             } else {
772               PetscCall(PetscTimSortResizeBuffer_Private(abuff, (r-m+1)*asize));
773               PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (r-m+1)*bsize));
774               PetscCall(PetscTimSortMergeHiWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
775             }
776           }
777           /* Update B with merge */
778           stack[i-1].size += stack[i].size;
779         }
780         --i;
781       } else if (stack[i-1].size <= stack[i].size) {
782         /* merge C into B */
783         goto mergeBC;
784       }
785     }
786     if (itemp == i) break;
787   }
788   *stacksize = i;
789   PetscFunctionReturn(0);
790 }
791 
792 /* March sequentially through the array building up a "run" of weakly increasing or strictly decreasing contiguous
793  elements. Decreasing runs are reversed by swapping. If the run is less than minrun, artificially extend it via either
794  binary insertion sort or regulat insertion sort */
795 static inline PetscErrorCode PetscTimSortBuildRun_Private(char *arr, char *tarr, size_t size, CompFunc cmp, void *ctx, PetscInt n, PetscInt minrun, PetscInt runstart, PetscInt *runend)
796 {
797   const PetscInt re = PetscMin(runstart+minrun, n-1);
798   PetscInt       ri = runstart;
799 
800   PetscFunctionBegin;
801   if (PetscUnlikely(runstart == n-1)) {*runend = runstart; PetscFunctionReturn(0);}
802   /* guess whether run is ascending or descending and tally up the longest consecutive run. essentially a coinflip for random data */
803   if ((*cmp)((arr)+(ri+1)*size, (arr)+ri*size, ctx) < 0) {
804     ++ri;
805     while (ri < n-1) {
806       if ((*cmp)((arr)+(ri+1)*size, (arr)+ri*size, ctx) >= 0) break;
807       ++ri;
808     }
809     {
810       PetscInt lo = runstart, hi = ri;
811       for (; lo < hi; ++lo, --hi) {
812         COPYSWAPPY(arr+lo*size, arr+hi*size, tarr, size);
813       }
814     }
815   } else {
816     ++ri;
817     while (ri < n-1) {
818       if ((*cmp)((arr)+(ri+1)*size, (arr)+ri*size, ctx) < 0) break;
819       ++ri;
820     }
821   }
822   if (ri < re) {
823     /* the attempt failed, this section likely contains random data. If ri got close to minrun (within 50%) then we try
824      binary search */
825     if (ri-runstart <= minrun >> 1) {
826       ++MIN_GALLOP_GLOBAL; /* didn't get close hedge our bets against random data */
827       PetscInsertionSort_Private(arr, tarr, size, cmp, ctx, runstart, ri, re);
828     } else {
829       PetscBinaryInsertionSort_Private(arr, tarr, size, cmp, ctx, runstart, ri, re);
830     }
831     *runend = re;
832   } else *runend = ri;
833   PetscFunctionReturn(0);
834 }
835 
836 static inline PetscErrorCode PetscTimSortBuildRunWithArray_Private(char *arr, char *atarr, size_t asize, char *barr, char *btarr, size_t bsize, CompFunc cmp, void *ctx, PetscInt n, PetscInt minrun, PetscInt runstart, PetscInt *runend)
837 {
838   const PetscInt re = PetscMin(runstart+minrun, n-1);
839   PetscInt       ri = runstart;
840 
841   PetscFunctionBegin;
842   if (PetscUnlikely(runstart == n-1)) {*runend = runstart; PetscFunctionReturn(0);}
843   /* guess whether run is ascending or descending and tally up the longest consecutive run. essentially a coinflip for random data */
844   if ((*cmp)((arr)+(ri+1)*asize, arr+(ri*asize), ctx) < 0) {
845     ++ri;
846     while (ri < n-1) {
847       if ((*cmp)((arr)+(ri+1)*asize, (arr)+ri*asize, ctx) >= 0) break;
848       ++ri;
849     }
850     {
851       PetscInt lo = runstart, hi = ri;
852       for (; lo < hi; ++lo, --hi) {
853         COPYSWAPPY2(arr+lo*asize, arr+hi*asize, asize, barr+lo*bsize, barr+hi*bsize, bsize, atarr);
854       }
855     }
856   } else {
857     ++ri;
858     while (ri < n-1) {
859       if ((*cmp)((arr)+(ri+1)*asize, (arr)+ri*asize, ctx) < 0) break;
860       ++ri;
861     }
862   }
863   if (ri < re) {
864     /* the attempt failed, this section likely contains random data. If ri got close to minrun (within 50%) then we try
865      binary search */
866     if (ri-runstart <= minrun >> 1) {
867       ++MIN_GALLOP_GLOBAL; /* didn't get close hedge our bets against random data */
868       PetscInsertionSortWithArray_Private(arr, atarr, asize, barr, btarr, bsize, cmp, ctx, runstart, ri, re);
869     } else {
870       PetscBinaryInsertionSortWithArray_Private(arr, atarr, asize, barr, btarr, bsize, cmp, ctx, runstart, ri, re);
871     }
872     *runend = re;
873   } else *runend = ri;
874   PetscFunctionReturn(0);
875 }
876 
877 /*@C
878   PetscTimSort - Sorts an array in place in increasing order using Tim Peters adaptive sorting algorithm.
879 
880   Not Collective
881 
882   Input Parameters:
883 + n    - number of values
884 . arr  - array to be sorted
885 . size - size in bytes of the datatype held in arr
886 . cmp  - function pointer to comparison function
887 - ctx  - optional context to be passed to comparison function, NULL if not needed
888 
889   Output Parameters:
890 . arr  - sorted array
891 
892   Notes:
893   Timsort makes the assumption that input data is already likely partially ordered, or that it contains contiguous
894  sections (termed 'runs') where the data is locally ordered (but not necessarily globally ordered). It therefore aims to
895  select slices of the array in such a way that resulting mergesorts operate on near perfectly length-balanced arrays. To
896  do so it repeatedly triggers attempts throughout to merge adjacent runs.
897 
898   Should one run continuously "win" a comparison the algorithm begins the "gallop" phase. It will aggressively search
899   the "winner" for the location of the "losers" next entry (and vice versa) to copy all preceding elements into place in
900   bulk. However if the data is truly unordered (as is the case with random data) the immense gains possible from these
901   searches are expected __not__ to repay their costs. While adjacent arrays are almost all nearly the same size, they
902   likely all contain similar data.
903 
904   Sample usage:
905   The comparison function must follow the qsort() comparison function paradigm, returning the sign of the difference
906   between its arguments. If left < right : return -1, if left == right : return 0, if left > right : return 1. The user
907   may also
908  change or reverse the order of the sort by flipping the above. Note that stability of the sort is only guaranteed if
909  the comparison function forms a valid trigraph. For example when sorting an array of type "my_type" in increasing
910   order
911 
912 .vb
913   int my_increasing_comparison_function(const void *left, const void *right, void *ctx) {
914     my_type l = *(my_type *) left, r = *(my_type *) right;
915     return (l < r) ? -1 : (l > r);
916   }
917 .ve
918   Note the context is unused here but you may use it to pass and subsequently access whatever information required
919   inside the comparison function. The context pointer will unaltered except for any changes made inside the comparison function.
920   Then pass the function
921 .vb
922   PetscTimSort(n, arr, sizeof(arr[0]), my_increasing_comparison_function, ctx)
923 .ve
924 
925   Fortran Notes:
926   To use this from fortran you must write a comparison subroutine with 4 arguments which accepts left, right, ctx and
927   returns result. For example
928 .vb
929  subroutine CompareIntegers(left,right,ctx,result)
930    implicit none
931 
932    PetscInt,intent(in) :: left, right
933    type(UserCtx)       :: ctx
934    integer,intent(out) :: result
935 
936    if (left < right) then
937      result = -1
938    else if (left == right) then
939      result = 0
940    else
941      result = 1
942    end if
943    return
944  end subroutine CompareIntegers
945 .ve
946 
947   References:
948 . * - Tim Peters. https://bugs.python.org/file4451/timsort.txt
949 
950   Level: developer
951 
952 .seealso: PetscTimSortWithArray(), PetscIntSortSemiOrdered(), PetscRealSortSemiOrdered(), PetscMPIIntSortSemiOrdered()
953 @*/
954 PetscErrorCode PetscTimSort(PetscInt n, void *arr, size_t size, int (*cmp)(const void *, const void *, void *), void *ctx)
955 {
956   PetscInt           stacksize = 0, minrun, runstart = 0, runend = 0;
957   PetscTimSortStack  runstack[128];
958   PetscTimSortBuffer buff;
959   /* stacksize  = log_phi(n) = log_2(n)/log_2(phi), so 128 is enough for ~5.614e26 elements.
960    It is so unlikely that this limit is reached that this is __never__ checked for */
961 
962   PetscFunctionBegin;
963   /* Compute minrun. Minrun should be (32, 65) such that N/minrun
964    is a power of 2 or one plus a power of 2 */
965   {
966     PetscInt t = n, r = 0;
967     /* r becomes 1 if the least significant bits contain at least one off bit */
968     while (t >= 64) {
969       r |= t & 1;
970       t >>= 1;
971     }
972     minrun = t + r;
973   }
974   if (PetscDefined(USE_DEBUG)) {
975     PetscCall(PetscInfo(NULL, "minrun = %" PetscInt_FMT "\n", minrun));
976     if (n < 64) {
977       PetscCall(PetscInfo(NULL, "n %" PetscInt_FMT " < 64, consider using PetscSortInt() instead\n", n));
978     } else PetscCheck(!(minrun < 32) && !(minrun > 65),PETSC_COMM_SELF,PETSC_ERR_PLIB,"Calculated minrun %" PetscInt_FMT " not in range (32,65)",minrun);
979   }
980   PetscCall(PetscMalloc1((size_t) minrun*size, &buff.ptr));
981   buff.size = (size_t) minrun*size;
982   buff.maxsize = (size_t) n*size;
983   MIN_GALLOP_GLOBAL = MIN_GALLOP_CONST_GLOBAL;
984   while (runstart < n) {
985     /* Check if additional entries are at least partially ordered and build natural run */
986     PetscCall(PetscTimSortBuildRun_Private((char *)arr, buff.ptr, size, cmp, ctx, n, minrun, runstart, &runend));
987     runstack[stacksize].start = runstart;
988     runstack[stacksize].size = runend-runstart+1;
989     PetscCall(PetscTimSortMergeCollapse_Private((char *)arr, size, cmp, ctx, &buff, runstack, &stacksize));
990     ++stacksize;
991     runstart = runend+1;
992   }
993   /* Have been inside while, so discard last stacksize++ */
994   --stacksize;
995   PetscCall(PetscTimSortForceCollapse_Private((char *)arr, size, cmp, ctx, &buff, runstack, stacksize));
996   PetscCall(PetscFree(buff.ptr));
997   MIN_GALLOP_GLOBAL = MIN_GALLOP_CONST_GLOBAL;
998   PetscFunctionReturn(0);
999 }
1000 
1001 /*@C
1002   PetscTimSortWithArray - Sorts an array in place in increasing order using Tim Peters adaptive sorting algorithm and
1003   reorders a second array to match the first. The arrays need not be the same type.
1004 
1005   Not Collective
1006 
1007   Input Parameters:
1008 + n     - number of values
1009 . asize - size in bytes of the datatype held in arr
1010 . bsize - size in bytes of the datatype held in barr
1011 . cmp   - function pointer to comparison function
1012 - ctx   - optional context to be passed to comparison function, NULL if not needed
1013 
1014   Input/Output Parameters:
1015 + arr  - array to be sorted, on output it is sorted
1016 - barr - array to be reordered, on output it is reordered
1017 
1018   Notes:
1019   The arrays need not be of the same type, however barr MUST contain at least as many elements as arr and the two CANNOT
1020   overlap.
1021 
1022   Timsort makes the assumption that input data is already likely partially ordered, or that it contains contiguous
1023   sections (termed 'runs') where the data is locally ordered (but not necessarily globally ordered). It therefore aims
1024  to select slices of the array in such a way that resulting mergesorts operate on near perfectly length-balanced
1025  arrays. To do so it repeatedly triggers attempts throughout to merge adjacent runs.
1026 
1027   Should one run continuously "win" a comparison the algorithm begins the "gallop" phase. It will aggressively search
1028   the "winner" for the location of the "losers" next entry (and vice versa) to copy all preceding elements into place in
1029   bulk. However if the data is truly unordered (as is the case with random data) the immense gains possible from these
1030   searches are expected __not__ to repay their costs. While adjacent arrays are almost all nearly the same size, they
1031   likely all contain similar data.
1032 
1033   Sample usage:
1034   The comparison function must follow the qsort() comparison function paradigm, returning the sign of the difference
1035   between its arguments. If left < right : return -1, if left == right : return 0, if left > right : return 1. The user
1036   may also change or reverse the order of the sort by flipping the above. Note that stability of the sort is only
1037   guaranteed if the comparison function forms a valid trigraph. For example when sorting an array of type "my_type" in
1038   increasing order
1039 
1040 .vb
1041   int my_increasing_comparison_function(const void *left, const void *right, void *ctx) {
1042     my_type l = *(my_type *) left, r = *(my_type *) right;
1043     return (l < r) ? -1 : (l > r);
1044   }
1045 .ve
1046   Note the context is unused here but you may use it to pass and subsequently access whatever information required
1047   inside the comparison function. The context pointer will unaltered except for any changes made inside the comparison function.
1048   Then pass the function
1049 .vb
1050   PetscTimSortWithArray(n, arr, sizeof(arr[0]), barr, sizeof(barr[0]), my_increasing_comparison_function, ctx)
1051 .ve
1052 
1053   Fortran Notes:
1054   To use this from fortran you must write a comparison subroutine with 4 arguments which accepts left, right, ctx and
1055   returns result. For example
1056 .vb
1057  subroutine CompareIntegers(left,right,ctx,result)
1058    implicit none
1059 
1060    PetscInt,intent(in) :: left, right
1061    type(UserCtx)       :: ctx
1062    integer,intent(out) :: result
1063 
1064    if (left < right) then
1065      result = -1
1066    else if (left == right) then
1067      result = 0
1068    else
1069      result = 1
1070    end if
1071    return
1072  end subroutine CompareIntegers
1073 .ve
1074 
1075   References:
1076 . * - Tim Peters. https://bugs.python.org/file4451/timsort.txt
1077 
1078   Level: developer
1079 
1080 .seealso: PetscTimSort(), PetscIntSortSemiOrderedWithArray(), PetscRealSortSemiOrderedWithArrayInt(), PetscMPIIntSortSemiOrderedWithArray()
1081 @*/
1082 PetscErrorCode PetscTimSortWithArray(PetscInt n, void *arr, size_t asize, void *barr, size_t bsize, int (*cmp)(const void *, const void *, void *), void *ctx)
1083 {
1084   PetscInt           stacksize = 0, minrun, runstart = 0, runend = 0;
1085   PetscTimSortStack  runstack[128];
1086   PetscTimSortBuffer abuff, bbuff;
1087   /* stacksize  = log_phi(n) = log_2(n)/log_2(phi), so 128 is enough for ~5.614e26 elements.
1088    It is so unlikely that this limit is reached that this is __never__ checked for */
1089 
1090   PetscFunctionBegin;
1091   /* Compute minrun. Minrun should be (32, 65) such that N/minrun
1092    is a power of 2 or one plus a power of 2 */
1093   {
1094     PetscInt t = n, r = 0;
1095     /* r becomes 1 if the least significant bits contain at least one off bit */
1096     while (t >= 64) {
1097       r |= t & 1;
1098       t >>= 1;
1099     }
1100     minrun = t + r;
1101   }
1102   if (PetscDefined(USE_DEBUG)) {
1103     PetscCall(PetscInfo(NULL, "minrun = %" PetscInt_FMT "\n", minrun));
1104     PetscCheckFalse((n >= 64) && ((minrun < 32) || (minrun > 65)),PETSC_COMM_SELF,PETSC_ERR_PLIB,"Calculated minrun %" PetscInt_FMT " not in range (32,65)",minrun);
1105   }
1106   PetscCall(PetscMalloc1((size_t) minrun*asize, &abuff.ptr));
1107   abuff.size = (size_t) minrun*asize;
1108   abuff.maxsize = (size_t) n*asize;
1109   PetscCall(PetscMalloc1((size_t) minrun*bsize, &bbuff.ptr));
1110   bbuff.size = (size_t) minrun*bsize;
1111   bbuff.maxsize = (size_t) n*bsize;
1112   MIN_GALLOP_GLOBAL = MIN_GALLOP_CONST_GLOBAL;
1113   while (runstart < n) {
1114     /* Check if additional entries are at least partially ordered and build natural run */
1115     PetscCall(PetscTimSortBuildRunWithArray_Private((char *)arr, abuff.ptr, asize, (char *)barr, bbuff.ptr, bsize, cmp, ctx, n, minrun, runstart, &runend));
1116     runstack[stacksize].start = runstart;
1117     runstack[stacksize].size = runend-runstart+1;
1118     PetscCall(PetscTimSortMergeCollapseWithArray_Private((char *)arr, asize, (char *)barr, bsize, cmp, ctx, &abuff, &bbuff, runstack, &stacksize));
1119     ++stacksize;
1120     runstart = runend+1;
1121   }
1122   /* Have been inside while, so discard last stacksize++ */
1123   --stacksize;
1124   PetscCall(PetscTimSortForceCollapseWithArray_Private((char *)arr, asize, (char *)barr, bsize, cmp, ctx, &abuff, &bbuff, runstack, stacksize));
1125   PetscCall(PetscFree(abuff.ptr));
1126   PetscCall(PetscFree(bbuff.ptr));
1127   MIN_GALLOP_GLOBAL = MIN_GALLOP_CONST_GLOBAL;
1128   PetscFunctionReturn(0);
1129 }
1130 
1131 /*@
1132    PetscIntSortSemiOrdered - Sorts an array of integers in place in increasing order.
1133 
1134    Not Collective
1135 
1136    Input Parameters:
1137 +  n   - number of values
1138 -  arr - array of integers
1139 
1140    Output Parameters:
1141 .  arr - sorted array of integers
1142 
1143    Notes:
1144    If the array is less than 64 entries long PetscSortInt() is automatically used.
1145 
1146    This function serves as an alternative to PetscSortInt(). While this function works for any array of integers it is
1147    significantly faster if the array is not totally random. There are exceptions to this and so it is __highly__
1148    recommended that the user benchmark their code to see which routine is fastest.
1149 
1150    Level: intermediate
1151 
1152 .seealso: PetscTimSort(), PetscSortInt(), PetscSortIntWithPermutation()
1153 @*/
1154 PetscErrorCode PetscIntSortSemiOrdered(PetscInt n, PetscInt arr[])
1155 {
1156   PetscFunctionBegin;
1157   if (n <= 1) PetscFunctionReturn(0);
1158   PetscValidIntPointer(arr,2);
1159   if (n < 64) {
1160     PetscCall(PetscSortInt(n, arr));
1161   } else {
1162     PetscCall(PetscTimSort(n, arr, sizeof(PetscInt), Compare_PetscInt_Private, NULL));
1163   }
1164   PetscFunctionReturn(0);
1165 }
1166 
1167 /*@
1168    PetscIntSortSemiOrderedWithArray - Sorts an array of integers in place in increasing order and reorders a second
1169    array to match the first.
1170 
1171    Not Collective
1172 
1173    Input Parameter:
1174 .  n   - number of values
1175 
1176    Input/Output Parameters:
1177 +  arr1 - array of integers to be sorted, modified on output
1178 -  arr2 - array of integers to be reordered, modified on output
1179 
1180    Notes:
1181    The arrays CANNOT overlap.
1182 
1183    This function serves as an alternative to PetscSortIntWithArray(). While this function works for any array of integers it is
1184    significantly faster if the array is not totally random. There are exceptions to this and so it is __highly__
1185    recommended that the user benchmark their code to see which routine is fastest.
1186 
1187    Level: intermediate
1188 
1189 .seealso: PetscTimSortWithArray(), PetscSortIntWithArray(), PetscSortIntWithPermutation()
1190 @*/
1191 PetscErrorCode PetscIntSortSemiOrderedWithArray(PetscInt n, PetscInt arr1[], PetscInt arr2[])
1192 {
1193   PetscFunctionBegin;
1194   if (n <= 1) PetscFunctionReturn(0);
1195   PetscValidIntPointer(arr1,2);
1196   PetscValidIntPointer(arr2,3);
1197   /* cannot export out to PetscIntSortWithArray here since it isn't stable */
1198   PetscCall(PetscTimSortWithArray(n, arr1, sizeof(PetscInt), arr2, sizeof(PetscInt), Compare_PetscInt_Private, NULL));
1199   PetscFunctionReturn(0);
1200 }
1201 
1202 /*@
1203    PetscMPIIntSortSemiOrdered - Sorts an array of PetscMPIInts in place in increasing order.
1204 
1205    Not Collective
1206 
1207    Input Parameters:
1208 +  n   - number of values
1209 -  arr - array of PetscMPIInts
1210 
1211    Output Parameters:
1212 .  arr - sorted array of integers
1213 
1214    Notes:
1215    If the array is less than 64 entries long PetscSortMPIInt() is automatically used.
1216 
1217    This function serves as an alternative to PetscSortMPIInt(). While this function works for any array of PetscMPIInts it is
1218    significantly faster if the array is not totally random. There are exceptions to this and so it is __highly__
1219    recommended that the user benchmark their code to see which routine is fastest.
1220 
1221    Level: intermediate
1222 
1223 .seealso: PetscTimSort(), PetscSortMPIInt()
1224 @*/
1225 PetscErrorCode PetscMPIIntSortSemiOrdered(PetscInt n, PetscMPIInt arr[])
1226 {
1227   PetscFunctionBegin;
1228   if (n <= 1) PetscFunctionReturn(0);
1229   PetscValidIntPointer(arr,2);
1230   if (n < 64) {
1231     PetscCall(PetscSortMPIInt(n, arr));
1232   } else {
1233     PetscCall(PetscTimSort(n, arr, sizeof(PetscMPIInt), Compare_PetscMPIInt_Private, NULL));
1234   }
1235   PetscFunctionReturn(0);
1236 }
1237 
1238 /*@
1239    PetscMPIIntSortSemiOrderedWithArray - Sorts an array of integers in place in increasing order and reorders a second
1240    array to match the first.
1241 
1242    Not Collective
1243 
1244    Input Parameter:
1245 .  n   - number of values
1246 
1247    Input/Output Parameters:
1248 .  arr1 - array of integers to be sorted, modified on output
1249 -  arr2 - array of integers to be reordered, modified on output
1250 
1251    Notes:
1252    The arrays CANNOT overlap.
1253 
1254    This function serves as an alternative to PetscSortMPIIntWithArray(). While this function works for any array of integers it is
1255    significantly faster if the array is not totally random. There are exceptions to this and so it is __highly__
1256    recommended that the user benchmark their code to see which routine is fastest.
1257 
1258    Level: intermediate
1259 
1260 .seealso: PetscTimSortWithArray(), PetscSortMPIIntWithArray(), PetscSortMPIIntWithPermutation()
1261 @*/
1262 PetscErrorCode PetscMPIIntSortSemiOrderedWithArray(PetscInt n, PetscMPIInt arr1[], PetscMPIInt arr2[])
1263 {
1264   PetscFunctionBegin;
1265   if (n <= 1) PetscFunctionReturn(0);
1266   PetscValidIntPointer(arr1,2);
1267   PetscValidIntPointer(arr2,3);
1268   /* cannot export out to PetscMPIIntSortWithArray here since it isn't stable */
1269   PetscCall(PetscTimSortWithArray(n, arr1, sizeof(PetscMPIInt), arr2, sizeof(PetscMPIInt), Compare_PetscMPIInt_Private, NULL));
1270   PetscFunctionReturn(0);
1271 }
1272 
1273 /*@
1274    PetscRealSortSemiOrdered - Sorts an array of PetscReals in place in increasing order.
1275 
1276    Not Collective
1277 
1278    Input Parameters:
1279 +  n   - number of values
1280 -  arr - array of PetscReals
1281 
1282    Output Parameters:
1283 .  arr - sorted array of integers
1284 
1285    Notes:
1286    If the array is less than 64 entries long PetscSortReal() is automatically used.
1287 
1288    This function serves as an alternative to PetscSortReal(). While this function works for any array of PetscReals it is
1289    significantly faster if the array is not totally random. There are exceptions to this and so it is __highly__
1290    recommended that the user benchmark their code to see which routine is fastest.
1291 
1292    Level: intermediate
1293 
1294 .seealso: PetscTimSort(), PetscSortReal(), PetscSortRealWithPermutation()
1295 @*/
1296 PetscErrorCode PetscRealSortSemiOrdered(PetscInt n, PetscReal arr[])
1297 {
1298   PetscFunctionBegin;
1299   if (n <= 1) PetscFunctionReturn(0);
1300   PetscValidRealPointer(arr,2);
1301   if (n < 64) {
1302     PetscCall(PetscSortReal(n, arr));
1303   } else {
1304     PetscCall(PetscTimSort(n, arr, sizeof(PetscReal), Compare_PetscReal_Private, NULL));
1305   }
1306   PetscFunctionReturn(0);
1307 }
1308 
1309 /*@
1310    PetscRealSortSemiOrderedWithArrayInt - Sorts an array of PetscReals in place in increasing order and reorders a second
1311    array of PetscInts to match the first.
1312 
1313    Not Collective
1314 
1315    Input Parameter:
1316 .  n   - number of values
1317 
1318    Input/Output Parameters:
1319 .  arr1 - array of PetscReals to be sorted, modified on output
1320 -  arr2 - array of PetscReals to be reordered, modified on output
1321 
1322    Notes:
1323    This function serves as an alternative to PetscSortRealWithArray(). While this function works for any array of PetscReals it is
1324    significantly faster if the array is not totally random. There are exceptions to this and so it is __highly__
1325    recommended that the user benchmark their code to see which routine is fastest.
1326 
1327    Level: intermediate
1328 
1329 .seealso: PetscTimSortWithArray(), PetscSortRealWithArrayInt(), PetscSortRealWithPermutation()
1330 @*/
1331 PetscErrorCode PetscRealSortSemiOrderedWithArrayInt(PetscInt n, PetscReal arr1[], PetscInt arr2[])
1332 {
1333   PetscFunctionBegin;
1334   if (n <= 1) PetscFunctionReturn(0);
1335   PetscValidRealPointer(arr1,2);
1336   PetscValidIntPointer(arr2,3);
1337   /* cannot export out to PetscRealSortWithArrayInt here since it isn't stable */
1338   PetscCall(PetscTimSortWithArray(n, arr1, sizeof(PetscReal), arr2, sizeof(PetscInt), Compare_PetscReal_Private, NULL));
1339   PetscFunctionReturn(0);
1340 }
1341