xref: /petsc/src/sys/utils/sortso.c (revision fbf9dbe564678ed6eff1806adbc4c4f01b9743f4)
1 #include <petscsys.h> /*I  "petscsys.h"  I*/
2 #include <petsc/private/petscimpl.h>
3 
4 static inline int Compare_PetscMPIInt_Private(const void *left, const void *right, PETSC_UNUSED void *ctx)
5 {
6   PetscMPIInt l = *(PetscMPIInt *)left, r = *(PetscMPIInt *)right;
7   return (l < r) ? -1 : (l > r);
8 }
9 
10 static inline int Compare_PetscInt_Private(const void *left, const void *right, PETSC_UNUSED void *ctx)
11 {
12   PetscInt l = *(PetscInt *)left, r = *(PetscInt *)right;
13   return (l < r) ? -1 : (l > r);
14 }
15 
16 static inline int Compare_PetscReal_Private(const void *left, const void *right, PETSC_UNUSED void *ctx)
17 {
18   PetscReal l = *(PetscReal *)left, r = *(PetscReal *)right;
19   return (l < r) ? -1 : (l > r);
20 }
21 
22 #define MIN_GALLOP_CONST_GLOBAL 8
23 static PetscInt MIN_GALLOP_GLOBAL = MIN_GALLOP_CONST_GLOBAL;
24 typedef int (*CompFunc)(const void *, const void *, void *);
25 
26 /* Mostly to force clang uses the builtins, but can't hurt to have gcc compilers also do the same */
27 #if !defined(PETSC_USE_DEBUG) && defined(__GNUC__)
28 static inline void COPYSWAPPY(char *a, char *b, char *t, size_t size)
29 {
30   __builtin_memcpy(t, b, size);
31   __builtin_memmove(b, a, size);
32   __builtin_memcpy(a, t, size);
33   return;
34 }
35 
36 static inline void COPYSWAPPY2(char *al, char *ar, size_t asize, char *bl, char *br, size_t bsize, char *t)
37 {
38   __builtin_memcpy(t, ar, asize);
39   __builtin_memmove(ar, al, asize);
40   __builtin_memcpy(al, t, asize);
41   __builtin_memcpy(t, br, bsize);
42   __builtin_memmove(br, bl, bsize);
43   __builtin_memcpy(bl, t, bsize);
44   return;
45 }
46 
47 static inline void Petsc_memcpy(char *dest, const char *src, size_t size)
48 {
49   __builtin_memcpy(dest, src, size);
50   return;
51 }
52 
53 static inline void Petsc_memcpy2(char *adest, const char *asrc, size_t asize, char *bdest, const char *bsrc, size_t bsize)
54 {
55   __builtin_memcpy(adest, asrc, asize);
56   __builtin_memcpy(bdest, bsrc, bsize);
57   return;
58 }
59 
60 static inline void Petsc_memmove(char *dest, const char *src, size_t size)
61 {
62   __builtin_memmove(dest, src, size);
63   return;
64 }
65 
66 static inline void Petsc_memmove2(char *adest, const char *asrc, size_t asize, char *bdest, const char *bsrc, size_t bsize)
67 {
68   __builtin_memmove(adest, asrc, asize);
69   __builtin_memmove(bdest, bsrc, bsize);
70   return;
71 }
72 #else
73 static inline void COPYSWAPPY(char *a, char *b, char *t, size_t size)
74 {
75   PetscFunctionBegin;
76   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(t, b, size));
77   PetscCallAbort(PETSC_COMM_SELF, PetscMemmove(b, a, size));
78   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(a, t, size));
79   PetscFunctionReturnVoid();
80 }
81 
82 static inline void COPYSWAPPY2(char *al, char *ar, size_t asize, char *bl, char *br, size_t bsize, char *t)
83 {
84   PetscFunctionBegin;
85   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(t, ar, asize));
86   PetscCallAbort(PETSC_COMM_SELF, PetscMemmove(ar, al, asize));
87   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(al, t, asize));
88   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(t, br, bsize));
89   PetscCallAbort(PETSC_COMM_SELF, PetscMemmove(br, bl, bsize));
90   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(bl, t, bsize));
91   PetscFunctionReturnVoid();
92 }
93 
94 static inline void Petsc_memcpy(char *dest, const char *src, size_t size)
95 {
96   PetscFunctionBegin;
97   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(dest, src, size));
98   PetscFunctionReturnVoid();
99 }
100 
101 static inline void Petsc_memcpy2(char *adest, const char *asrc, size_t asize, char *bdest, const char *bsrc, size_t bsize)
102 {
103   PetscFunctionBegin;
104   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(adest, asrc, asize));
105   PetscCallAbort(PETSC_COMM_SELF, PetscMemcpy(bdest, bsrc, bsize));
106   PetscFunctionReturnVoid();
107 }
108 
109 static inline void Petsc_memmove(char *dest, const char *src, size_t size)
110 {
111   PetscFunctionBegin;
112   PetscCallAbort(PETSC_COMM_SELF, PetscMemmove(dest, src, size));
113   PetscFunctionReturnVoid();
114 }
115 
116 static inline void Petsc_memmove2(char *adest, const char *asrc, size_t asize, char *bdest, const char *bsrc, size_t bsize)
117 {
118   PetscFunctionBegin;
119   PetscCallAbort(PETSC_COMM_SELF, PetscMemmove(adest, asrc, asize));
120   PetscCallAbort(PETSC_COMM_SELF, PetscMemmove(bdest, bsrc, bsize));
121   PetscFunctionReturnVoid();
122 }
123 #endif
124 
125 /* Start left look right. Looking for e.g. B[0] in A or mergelo. l inclusive, r inclusive. Returns first m such that arr[m] >
126  x. Output also inclusive.
127 
128  NOTE: Both gallopsearch functions CANNOT distinguish between inserting AFTER the entire array vs inserting at entry n!! For example for an array:
129 
130    {0,1,2,3,4,5}
131 
132    when looking to insert "5" this routine will return *m = 6, but when looking to insert "6" it will ALSO return *m = 6.
133   */
134 static inline PetscErrorCode PetscGallopSearchLeft_Private(const char *arr, size_t size, CompFunc cmp, void *ctx, PetscInt l, PetscInt r, const char *x, PetscInt *m)
135 {
136   PetscInt last = l, k = 1, mid, cur = l + 1;
137 
138   PetscFunctionBegin;
139   *m = l;
140   PetscAssert(r >= l, PETSC_COMM_SELF, PETSC_ERR_PLIB, "r %" PetscInt_FMT " < l %" PetscInt_FMT " in PetscGallopSearchLeft", r, l);
141   if ((*cmp)(x, arr + r * size, ctx) >= 0) {
142     *m = r;
143     PetscFunctionReturn(PETSC_SUCCESS);
144   }
145   if ((*cmp)(x, (arr) + l * size, ctx) < 0 || PetscUnlikely(!(r - l))) PetscFunctionReturn(PETSC_SUCCESS);
146   while (PETSC_TRUE) {
147     if (PetscUnlikely(cur > r)) {
148       cur = r;
149       break;
150     }
151     if ((*cmp)(x, (arr) + cur * size, ctx) < 0) break;
152     last = cur;
153     cur += (k <<= 1) + 1;
154     ++k;
155   }
156   /* standard binary search but take last 0 mid 0 cur 1 into account*/
157   while (cur > last + 1) {
158     mid = last + ((cur - last) >> 1);
159     if ((*cmp)(x, (arr) + mid * size, ctx) < 0) {
160       cur = mid;
161     } else {
162       last = mid;
163     }
164   }
165   *m = cur;
166   PetscFunctionReturn(PETSC_SUCCESS);
167 }
168 
169 /* Start right look left. Looking for e.g. A[-1] in B or mergehi. l inclusive, r inclusive. Returns last m such that arr[m]
170  < x. Output also inclusive */
171 static inline PetscErrorCode PetscGallopSearchRight_Private(const char *arr, size_t size, CompFunc cmp, void *ctx, PetscInt l, PetscInt r, const char *x, PetscInt *m)
172 {
173   PetscInt last = r, k = 1, mid, cur = r - 1;
174 
175   PetscFunctionBegin;
176   *m = r;
177   PetscAssert(r >= l, PETSC_COMM_SELF, PETSC_ERR_PLIB, "r %" PetscInt_FMT " < l %" PetscInt_FMT " in PetscGallopSearchRight", r, l);
178   if ((*cmp)(x, arr + l * size, ctx) <= 0) {
179     *m = l;
180     PetscFunctionReturn(PETSC_SUCCESS);
181   }
182   if ((*cmp)(x, (arr) + r * size, ctx) > 0 || PetscUnlikely(!(r - l))) PetscFunctionReturn(PETSC_SUCCESS);
183   while (PETSC_TRUE) {
184     if (PetscUnlikely(cur < l)) {
185       cur = l;
186       break;
187     }
188     if ((*cmp)(x, (arr) + cur * size, ctx) > 0) break;
189     last = cur;
190     cur -= (k <<= 1) + 1;
191     ++k;
192   }
193   /* standard binary search but take last r-1 mid r-1 cur r-2 into account*/
194   while (last > cur + 1) {
195     mid = last - ((last - cur) >> 1);
196     if ((*cmp)(x, (arr) + mid * size, ctx) > 0) {
197       cur = mid;
198     } else {
199       last = mid;
200     }
201   }
202   *m = cur;
203   PetscFunctionReturn(PETSC_SUCCESS);
204 }
205 
206 /* Mergesort where size of left half <= size of right half, so mergesort is done left to right. Arr should be pointer to
207  complete array, left is first index of left array, mid is first index of right array, right is last index of right
208  array */
209 static inline PetscErrorCode PetscTimSortMergeLo_Private(char *arr, char *tarr, size_t size, CompFunc cmp, void *ctx, PetscInt left, PetscInt mid, PetscInt right)
210 {
211   PetscInt       i = 0, j = mid, k = left, gallopleft = 0, gallopright = 0;
212   const PetscInt llen = mid - left;
213 
214   PetscFunctionBegin;
215   Petsc_memcpy(tarr, arr + (left * size), llen * size);
216   while ((i < llen) && (j <= right)) {
217     if ((*cmp)(tarr + (i * size), arr + (j * size), ctx) < 0) {
218       Petsc_memcpy(arr + (k * size), tarr + (i * size), size);
219       ++k;
220       gallopright = 0;
221       if (++i < llen && ++gallopleft >= MIN_GALLOP_GLOBAL) {
222         PetscInt l1, l2, diff1, diff2;
223         do {
224           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
225           /* search temp for right[j], can move up to that of temp into arr immediately */
226           PetscCall(PetscGallopSearchLeft_Private(tarr, size, cmp, ctx, i, llen - 1, arr + (j * size), &l1));
227           diff1 = l1 - i;
228           Petsc_memcpy(arr + (k * size), tarr + (i * size), diff1 * size);
229           k += diff1;
230           i = l1;
231           /* search right for temp[i], can move up to that many of right into arr */
232           PetscCall(PetscGallopSearchLeft_Private(arr, size, cmp, ctx, j, right, tarr + (i * size), &l2));
233           diff2 = l2 - j;
234           Petsc_memmove((arr) + k * size, (arr) + j * size, diff2 * size);
235           k += diff2;
236           j = l2;
237           if (i >= llen || j > right) break;
238         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
239         ++MIN_GALLOP_GLOBAL;
240       }
241     } else {
242       Petsc_memmove(arr + (k * size), arr + (j * size), size);
243       ++k;
244       gallopleft = 0;
245       if (++j <= right && ++gallopright >= MIN_GALLOP_GLOBAL) {
246         PetscInt l1, l2, diff1, diff2;
247         do {
248           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
249           /* search right for temp[i], can move up to that many of right into arr */
250           PetscCall(PetscGallopSearchLeft_Private(arr, size, cmp, ctx, j, right, tarr + (i * size), &l2));
251           diff2 = l2 - j;
252           Petsc_memmove(arr + (k * size), arr + (j * size), diff2 * size);
253           k += diff2;
254           j = l2;
255           /* search temp for right[j], can copy up to that of temp into arr immediately */
256           PetscCall(PetscGallopSearchLeft_Private(tarr, size, cmp, ctx, i, llen - 1, arr + (j * size), &l1));
257           diff1 = l1 - i;
258           Petsc_memcpy(arr + (k * size), tarr + (i * size), diff1 * size);
259           k += diff1;
260           i = l1;
261           if (i >= llen || j > right) break;
262         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
263         ++MIN_GALLOP_GLOBAL;
264       }
265     }
266   }
267   if (i < llen) Petsc_memcpy(arr + (k * size), tarr + (i * size), (llen - i) * size);
268   PetscFunctionReturn(PETSC_SUCCESS);
269 }
270 
271 static inline PetscErrorCode PetscTimSortMergeLoWithArray_Private(char *arr, char *atarr, size_t asize, char *barr, char *btarr, size_t bsize, CompFunc cmp, void *ctx, PetscInt left, PetscInt mid, PetscInt right)
272 {
273   PetscInt       i = 0, j = mid, k = left, gallopleft = 0, gallopright = 0;
274   const PetscInt llen = mid - left;
275 
276   PetscFunctionBegin;
277   Petsc_memcpy2(atarr, arr + (left * asize), llen * asize, btarr, barr + (left * bsize), llen * bsize);
278   while ((i < llen) && (j <= right)) {
279     if ((*cmp)(atarr + (i * asize), arr + (j * asize), ctx) < 0) {
280       Petsc_memcpy2(arr + (k * asize), atarr + (i * asize), asize, barr + (k * bsize), btarr + (i * bsize), bsize);
281       ++k;
282       gallopright = 0;
283       if (++i < llen && ++gallopleft >= MIN_GALLOP_GLOBAL) {
284         PetscInt l1, l2, diff1, diff2;
285         do {
286           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
287           /* search temp for right[j], can move up to that of temp into arr immediately */
288           PetscCall(PetscGallopSearchLeft_Private(atarr, asize, cmp, ctx, i, llen - 1, arr + (j * asize), &l1));
289           diff1 = l1 - i;
290           Petsc_memcpy2(arr + (k * asize), atarr + (i * asize), diff1 * asize, barr + (k * bsize), btarr + (i * bsize), diff1 * bsize);
291           k += diff1;
292           i = l1;
293           /* search right for temp[i], can move up to that many of right into arr */
294           PetscCall(PetscGallopSearchLeft_Private(arr, asize, cmp, ctx, j, right, atarr + (i * asize), &l2));
295           diff2 = l2 - j;
296           Petsc_memmove2(arr + (k * asize), arr + (j * asize), diff2 * asize, barr + (k * bsize), barr + (j * bsize), diff2 * bsize);
297           k += diff2;
298           j = l2;
299           if (i >= llen || j > right) break;
300         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
301         ++MIN_GALLOP_GLOBAL;
302       }
303     } else {
304       Petsc_memmove2(arr + (k * asize), arr + (j * asize), asize, barr + (k * bsize), barr + (j * bsize), bsize);
305       ++k;
306       gallopleft = 0;
307       if (++j <= right && ++gallopright >= MIN_GALLOP_GLOBAL) {
308         PetscInt l1, l2, diff1, diff2;
309         do {
310           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
311           /* search right for temp[i], can move up to that many of right into arr */
312           PetscCall(PetscGallopSearchLeft_Private(arr, asize, cmp, ctx, j, right, atarr + (i * asize), &l2));
313           diff2 = l2 - j;
314           Petsc_memmove2(arr + (k * asize), arr + (j * asize), diff2 * asize, barr + (k * bsize), barr + (j * bsize), diff2 * bsize);
315           k += diff2;
316           j = l2;
317           /* search temp for right[j], can copy up to that of temp into arr immediately */
318           PetscCall(PetscGallopSearchLeft_Private(atarr, asize, cmp, ctx, i, llen - 1, arr + (j * asize), &l1));
319           diff1 = l1 - i;
320           Petsc_memcpy2(arr + (k * asize), atarr + (i * asize), diff1 * asize, barr + (k * bsize), btarr + (i * bsize), diff1 * bsize);
321           k += diff1;
322           i = l1;
323           if (i >= llen || j > right) break;
324         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
325         ++MIN_GALLOP_GLOBAL;
326       }
327     }
328   }
329   if (i < llen) Petsc_memcpy2(arr + (k * asize), atarr + (i * asize), (llen - i) * asize, barr + (k * bsize), btarr + (i * bsize), (llen - i) * bsize);
330   PetscFunctionReturn(PETSC_SUCCESS);
331 }
332 
333 /* Mergesort where size of right half < size of left half, so mergesort is done right to left. Arr should be pointer to
334  complete array, left is first index of left array, mid is first index of right array, right is last index of right
335  array */
336 static inline PetscErrorCode PetscTimSortMergeHi_Private(char *arr, char *tarr, size_t size, CompFunc cmp, void *ctx, PetscInt left, PetscInt mid, PetscInt right)
337 {
338   PetscInt       i = right - mid, j = mid - 1, k = right, gallopleft = 0, gallopright = 0;
339   const PetscInt rlen = right - mid + 1;
340 
341   PetscFunctionBegin;
342   Petsc_memcpy(tarr, (arr) + mid * size, rlen * size);
343   while ((i >= 0) && (j >= left)) {
344     if ((*cmp)((tarr) + i * size, (arr) + j * size, ctx) > 0) {
345       Petsc_memcpy((arr) + k * size, (tarr) + i * size, size);
346       --k;
347       gallopleft = 0;
348       if (--i >= 0 && ++gallopright >= MIN_GALLOP_GLOBAL) {
349         PetscInt l1, l2, diff1, diff2;
350         do {
351           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
352           /* search temp for left[j], can copy up to that many of temp into arr */
353           PetscCall(PetscGallopSearchRight_Private(tarr, size, cmp, ctx, 0, i, (arr) + j * size, &l1));
354           diff1 = i - l1;
355           Petsc_memcpy((arr) + (k - diff1 + 1) * size, (tarr) + (l1 + 1) * size, diff1 * size);
356           k -= diff1;
357           i = l1;
358           /* search left for temp[i], can move up to that many of left up arr */
359           PetscCall(PetscGallopSearchRight_Private(arr, size, cmp, ctx, left, j, (tarr) + i * size, &l2));
360           diff2 = j - l2;
361           Petsc_memmove((arr) + (k - diff2 + 1) * size, (arr) + (l2 + 1) * size, diff2 * size);
362           k -= diff2;
363           j = l2;
364           if (i < 0 || j < left) break;
365         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
366         ++MIN_GALLOP_GLOBAL;
367       }
368     } else {
369       Petsc_memmove((arr) + k * size, (arr) + j * size, size);
370       --k;
371       gallopright = 0;
372       if (--j >= left && ++gallopleft >= MIN_GALLOP_GLOBAL) {
373         PetscInt l1, l2, diff1, diff2;
374         do {
375           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
376           /* search left for temp[i], can move up to that many of left up arr */
377           PetscCall(PetscGallopSearchRight_Private(arr, size, cmp, ctx, left, j, (tarr) + i * size, &l2));
378           diff2 = j - l2;
379           Petsc_memmove((arr) + (k - diff2 + 1) * size, (arr) + (l2 + 1) * size, diff2 * size);
380           k -= diff2;
381           j = l2;
382           /* search temp for left[j], can copy up to that many of temp into arr */
383           PetscCall(PetscGallopSearchRight_Private(tarr, size, cmp, ctx, 0, i, (arr) + j * size, &l1));
384           diff1 = i - l1;
385           Petsc_memcpy((arr) + (k - diff1 + 1) * size, (tarr) + (l1 + 1) * size, diff1 * size);
386           k -= diff1;
387           i = l1;
388           if (i < 0 || j < left) break;
389         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
390         ++MIN_GALLOP_GLOBAL;
391       }
392     }
393   }
394   if (i >= 0) Petsc_memcpy((arr) + left * size, tarr, (i + 1) * size);
395   PetscFunctionReturn(PETSC_SUCCESS);
396 }
397 
398 static inline PetscErrorCode PetscTimSortMergeHiWithArray_Private(char *arr, char *atarr, size_t asize, char *barr, char *btarr, size_t bsize, CompFunc cmp, void *ctx, PetscInt left, PetscInt mid, PetscInt right)
399 {
400   PetscInt       i = right - mid, j = mid - 1, k = right, gallopleft = 0, gallopright = 0;
401   const PetscInt rlen = right - mid + 1;
402 
403   PetscFunctionBegin;
404   Petsc_memcpy2(atarr, (arr) + mid * asize, rlen * asize, btarr, (barr) + mid * bsize, rlen * bsize);
405   while ((i >= 0) && (j >= left)) {
406     if ((*cmp)((atarr) + i * asize, (arr) + j * asize, ctx) > 0) {
407       Petsc_memcpy2((arr) + k * asize, (atarr) + i * asize, asize, (barr) + k * bsize, (btarr) + i * bsize, bsize);
408       --k;
409       gallopleft = 0;
410       if (--i >= 0 && ++gallopright >= MIN_GALLOP_GLOBAL) {
411         PetscInt l1, l2, diff1, diff2;
412         do {
413           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
414           /* search temp for left[j], can copy up to that many of temp into arr */
415           PetscCall(PetscGallopSearchRight_Private(atarr, asize, cmp, ctx, 0, i, (arr) + j * asize, &l1));
416           diff1 = i - l1;
417           Petsc_memcpy2((arr) + (k - diff1 + 1) * asize, (atarr) + (l1 + 1) * asize, diff1 * asize, (barr) + (k - diff1 + 1) * bsize, (btarr) + (l1 + 1) * bsize, diff1 * bsize);
418           k -= diff1;
419           i = l1;
420           /* search left for temp[i], can move up to that many of left up arr */
421           PetscCall(PetscGallopSearchRight_Private(arr, asize, cmp, ctx, left, j, (atarr) + i * asize, &l2));
422           diff2 = j - l2;
423           Petsc_memmove2((arr) + (k - diff2 + 1) * asize, (arr) + (l2 + 1) * asize, diff2 * asize, (barr) + (k - diff2 + 1) * bsize, (barr) + (l2 + 1) * bsize, diff2 * bsize);
424           k -= diff2;
425           j = l2;
426           if (i < 0 || j < left) break;
427         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
428         ++MIN_GALLOP_GLOBAL;
429       }
430     } else {
431       Petsc_memmove2((arr) + k * asize, (arr) + j * asize, asize, (barr) + k * bsize, (barr) + j * bsize, bsize);
432       --k;
433       gallopright = 0;
434       if (--j >= left && ++gallopleft >= MIN_GALLOP_GLOBAL) {
435         PetscInt l1, l2, diff1, diff2;
436         do {
437           if (MIN_GALLOP_GLOBAL > 1) --MIN_GALLOP_GLOBAL;
438           /* search left for temp[i], can move up to that many of left up arr */
439           PetscCall(PetscGallopSearchRight_Private(arr, asize, cmp, ctx, left, j, (atarr) + i * asize, &l2));
440           diff2 = j - l2;
441           Petsc_memmove2((arr) + (k - diff2 + 1) * asize, (arr) + (l2 + 1) * asize, diff2 * asize, (barr) + (k - diff2 + 1) * bsize, (barr) + (l2 + 1) * bsize, diff2 * bsize);
442           k -= diff2;
443           j = l2;
444           /* search temp for left[j], can copy up to that many of temp into arr */
445           PetscCall(PetscGallopSearchRight_Private(atarr, asize, cmp, ctx, 0, i, (arr) + j * asize, &l1));
446           diff1 = i - l1;
447           Petsc_memcpy2((arr) + (k - diff1 + 1) * asize, (atarr) + (l1 + 1) * asize, diff1 * asize, (barr) + (k - diff1 + 1) * bsize, (btarr) + (l1 + 1) * bsize, diff1 * bsize);
448           k -= diff1;
449           i = l1;
450           if (i < 0 || j < left) break;
451         } while (diff1 > MIN_GALLOP_GLOBAL || diff2 > MIN_GALLOP_GLOBAL);
452         ++MIN_GALLOP_GLOBAL;
453       }
454     }
455   }
456   if (i >= 0) Petsc_memcpy2((arr) + left * asize, atarr, (i + 1) * asize, (barr) + left * bsize, btarr, (i + 1) * bsize);
457   PetscFunctionReturn(PETSC_SUCCESS);
458 }
459 
460 /* Left is inclusive lower bound of array slice, start is start location of unsorted section, right is inclusive upper
461  bound of array slice. If unsure of where unsorted section starts or if entire length is unsorted pass start = left */
462 static inline PetscErrorCode PetscInsertionSort_Private(char *arr, char *tarr, size_t size, CompFunc cmp, void *ctx, PetscInt left, PetscInt start, PetscInt right)
463 {
464   PetscInt i = start == left ? start + 1 : start;
465 
466   PetscFunctionBegin;
467   for (; i <= right; ++i) {
468     PetscInt j = i - 1;
469     Petsc_memcpy(tarr, arr + (i * size), size);
470     while ((j >= left) && ((*cmp)(tarr, (arr) + j * size, ctx) < 0)) {
471       COPYSWAPPY(arr + (j + 1) * size, arr + j * size, tarr + size, size);
472       --j;
473     }
474     Petsc_memcpy((arr) + (j + 1) * size, tarr, size);
475   }
476   PetscFunctionReturn(PETSC_SUCCESS);
477 }
478 
479 static inline PetscErrorCode PetscInsertionSortWithArray_Private(char *arr, char *atarr, size_t asize, char *barr, char *btarr, size_t bsize, CompFunc cmp, void *ctx, PetscInt left, PetscInt start, PetscInt right)
480 {
481   PetscInt i = start == left ? start + 1 : start;
482 
483   PetscFunctionBegin;
484   for (; i <= right; ++i) {
485     PetscInt j = i - 1;
486     Petsc_memcpy2(atarr, arr + (i * asize), asize, btarr, barr + (i * bsize), bsize);
487     while ((j >= left) && ((*cmp)(atarr, arr + (j * asize), ctx) < 0)) {
488       COPYSWAPPY2(arr + (j + 1) * asize, arr + j * asize, asize, barr + (j + 1) * bsize, barr + j * bsize, bsize, atarr + asize);
489       --j;
490     }
491     Petsc_memcpy2(arr + (j + 1) * asize, atarr, asize, barr + (j + 1) * bsize, btarr, bsize);
492   }
493   PetscFunctionReturn(PETSC_SUCCESS);
494 }
495 
496 /* See PetscInsertionSort_Private */
497 static inline PetscErrorCode PetscBinaryInsertionSort_Private(char *arr, char *tarr, size_t size, CompFunc cmp, void *ctx, PetscInt left, PetscInt start, PetscInt right)
498 {
499   PetscInt i = start == left ? start + 1 : start;
500 
501   PetscFunctionBegin;
502   for (; i <= right; ++i) {
503     PetscInt l = left, r = i;
504     Petsc_memcpy(tarr, arr + (i * size), size);
505     do {
506       const PetscInt m = l + ((r - l) >> 1);
507       if ((*cmp)(tarr, arr + (m * size), ctx) < 0) {
508         r = m;
509       } else {
510         l = m + 1;
511       }
512     } while (l < r);
513     Petsc_memmove(arr + ((l + 1) * size), arr + (l * size), (i - l) * size);
514     Petsc_memcpy(arr + (l * size), tarr, size);
515   }
516   PetscFunctionReturn(PETSC_SUCCESS);
517 }
518 
519 static inline PetscErrorCode PetscBinaryInsertionSortWithArray_Private(char *arr, char *atarr, size_t asize, char *barr, char *btarr, size_t bsize, CompFunc cmp, void *ctx, PetscInt left, PetscInt start, PetscInt right)
520 {
521   PetscInt i = start == left ? start + 1 : start;
522 
523   PetscFunctionBegin;
524   for (; i <= right; ++i) {
525     PetscInt l = left, r = i;
526     Petsc_memcpy2(atarr, arr + (i * asize), asize, btarr, barr + (i * bsize), bsize);
527     do {
528       const PetscInt m = l + ((r - l) >> 1);
529       if ((*cmp)(atarr, arr + (m * asize), ctx) < 0) {
530         r = m;
531       } else {
532         l = m + 1;
533       }
534     } while (l < r);
535     Petsc_memmove2(arr + ((l + 1) * asize), arr + (l * asize), (i - l) * asize, barr + ((l + 1) * bsize), barr + (l * bsize), (i - l) * bsize);
536     Petsc_memcpy2(arr + (l * asize), atarr, asize, barr + (l * bsize), btarr, bsize);
537   }
538   PetscFunctionReturn(PETSC_SUCCESS);
539 }
540 
541 typedef struct {
542   PetscInt size;
543   PetscInt start;
544 #if defined(__CRAY_AARCH64) /* segfaults with Cray compilers for aarch64 on a64FX */
545 } PetscTimSortStack;
546 #else
547 } PetscTimSortStack PETSC_ATTRIBUTEALIGNED(2 * sizeof(PetscInt));
548 #endif
549 
550 typedef struct {
551   char *ptr PETSC_ATTRIBUTEALIGNED(PETSC_MEMALIGN);
552   size_t    size;
553   size_t    maxsize;
554 } PetscTimSortBuffer;
555 
556 static inline PetscErrorCode PetscTimSortResizeBuffer_Private(PetscTimSortBuffer *buff, size_t newSize)
557 {
558   PetscFunctionBegin;
559   if (PetscLikely(newSize <= buff->size)) PetscFunctionReturn(PETSC_SUCCESS);
560   {
561     /* Can't be larger than n, there is merit to simply allocating buff to n to begin with */
562     size_t newMax = PetscMin(newSize * newSize, buff->maxsize);
563     PetscCall(PetscFree(buff->ptr));
564     PetscCall(PetscMalloc1(newMax, &buff->ptr));
565     buff->size = newMax;
566   }
567   PetscFunctionReturn(PETSC_SUCCESS);
568 }
569 
570 static inline PetscErrorCode PetscTimSortForceCollapse_Private(char *arr, size_t size, CompFunc cmp, void *ctx, PetscTimSortBuffer *buff, PetscTimSortStack *stack, PetscInt stacksize)
571 {
572   PetscFunctionBegin;
573   for (; stacksize; --stacksize) {
574     /* A = stack[i-1], B = stack[i], if A[-1] <= B[0] means sorted */
575     if ((*cmp)(arr + (stack[stacksize].start - 1) * size, arr + (stack[stacksize].start) * size, ctx) > 0) {
576       PetscInt l, m = stack[stacksize].start, r;
577       /* Search A for B[0] insertion */
578       PetscCall(PetscGallopSearchLeft_Private(arr, size, cmp, ctx, stack[stacksize - 1].start, stack[stacksize].start - 1, (arr) + (stack[stacksize].start) * size, &l));
579       /* Search B for A[-1] insertion */
580       PetscCall(PetscGallopSearchRight_Private(arr, size, cmp, ctx, stack[stacksize].start, stack[stacksize].start + stack[stacksize].size - 1, (arr) + (stack[stacksize].start - 1) * size, &r));
581       if (m - l <= r - m) {
582         PetscCall(PetscTimSortResizeBuffer_Private(buff, (m - l + 1) * size));
583         PetscCall(PetscTimSortMergeLo_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
584       } else {
585         PetscCall(PetscTimSortResizeBuffer_Private(buff, (r - m + 1) * size));
586         PetscCall(PetscTimSortMergeHi_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
587       }
588     }
589     /* Update A with merge */
590     stack[stacksize - 1].size += stack[stacksize].size;
591   }
592   PetscFunctionReturn(PETSC_SUCCESS);
593 }
594 
595 static inline PetscErrorCode PetscTimSortForceCollapseWithArray_Private(char *arr, size_t asize, char *barr, size_t bsize, CompFunc cmp, void *ctx, PetscTimSortBuffer *abuff, PetscTimSortBuffer *bbuff, PetscTimSortStack *stack, PetscInt stacksize)
596 {
597   PetscFunctionBegin;
598   for (; stacksize; --stacksize) {
599     /* A = stack[i-1], B = stack[i], if A[-1] <= B[0] means sorted */
600     if ((*cmp)(arr + (stack[stacksize].start - 1) * asize, arr + (stack[stacksize].start) * asize, ctx) > 0) {
601       PetscInt l, m = stack[stacksize].start, r;
602       /* Search A for B[0] insertion */
603       PetscCall(PetscGallopSearchLeft_Private(arr, asize, cmp, ctx, stack[stacksize - 1].start, stack[stacksize].start - 1, (arr) + (stack[stacksize].start) * asize, &l));
604       /* Search B for A[-1] insertion */
605       PetscCall(PetscGallopSearchRight_Private(arr, asize, cmp, ctx, stack[stacksize].start, stack[stacksize].start + stack[stacksize].size - 1, (arr) + (stack[stacksize].start - 1) * asize, &r));
606       if (m - l <= r - m) {
607         PetscCall(PetscTimSortResizeBuffer_Private(abuff, (m - l + 1) * asize));
608         PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (m - l + 1) * bsize));
609         PetscCall(PetscTimSortMergeLoWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
610       } else {
611         PetscCall(PetscTimSortResizeBuffer_Private(abuff, (r - m + 1) * asize));
612         PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (r - m + 1) * bsize));
613         PetscCall(PetscTimSortMergeHiWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
614       }
615     }
616     /* Update A with merge */
617     stack[stacksize - 1].size += stack[stacksize].size;
618   }
619   PetscFunctionReturn(PETSC_SUCCESS);
620 }
621 
622 static inline PetscErrorCode PetscTimSortMergeCollapse_Private(char *arr, size_t size, CompFunc cmp, void *ctx, PetscTimSortBuffer *buff, PetscTimSortStack *stack, PetscInt *stacksize)
623 {
624   PetscInt i = *stacksize;
625 
626   PetscFunctionBegin;
627   while (i) {
628     PetscInt l, m, r, itemp = i;
629 
630     if (i == 1) {
631       /* A = stack[i-1], B = stack[i] */
632       if (stack[i - 1].size < stack[i].size) {
633         /* if A[-1] <= B[0] then sorted */
634         if ((*cmp)(arr + (stack[i].start - 1) * size, arr + (stack[i].start) * size, ctx) > 0) {
635           m = stack[i].start;
636           /* Search A for B[0] insertion */
637           PetscCall(PetscGallopSearchLeft_Private(arr, size, cmp, ctx, stack[i - 1].start, stack[i].start - 1, (arr) + stack[i].start * size, &l));
638           /* Search B for A[-1] insertion */
639           PetscCall(PetscGallopSearchRight_Private(arr, size, cmp, ctx, stack[i].start, stack[i].start + stack[i].size - 1, arr + (stack[i].start - 1) * size, &r));
640           if (m - l <= r - m) {
641             PetscCall(PetscTimSortResizeBuffer_Private(buff, (m - l + 1) * size));
642             PetscCall(PetscTimSortMergeLo_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
643           } else {
644             PetscCall(PetscTimSortResizeBuffer_Private(buff, (r - m + 1) * size));
645             PetscCall(PetscTimSortMergeHi_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
646           }
647         }
648         /* Update A with merge */
649         stack[i - 1].size += stack[i].size;
650         --i;
651       }
652     } else {
653       /* i > 2, i.e. C exists
654        A = stack[i-2], B = stack[i-1], C = stack[i]; */
655       if (stack[i - 2].size <= stack[i - 1].size + stack[i].size) {
656         if (stack[i - 2].size < stack[i].size) {
657           /* merge B into A, but if A[-1] <= B[0] then already sorted */
658           if ((*cmp)(arr + (stack[i - 1].start - 1) * size, arr + (stack[i - 1].start) * size, ctx) > 0) {
659             m = stack[i - 1].start;
660             /* Search A for B[0] insertion */
661             PetscCall(PetscGallopSearchLeft_Private(arr, size, cmp, ctx, stack[i - 2].start, stack[i - 1].start - 1, (arr) + (stack[i - 1].start) * size, &l));
662             /* Search B for A[-1] insertion */
663             PetscCall(PetscGallopSearchRight_Private(arr, size, cmp, ctx, stack[i - 1].start, stack[i - 1].start + stack[i - 1].size - 1, (arr) + (stack[i - 1].start - 1) * size, &r));
664             if (m - l <= r - m) {
665               PetscCall(PetscTimSortResizeBuffer_Private(buff, (m - l + 1) * size));
666               PetscCall(PetscTimSortMergeLo_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
667             } else {
668               PetscCall(PetscTimSortResizeBuffer_Private(buff, (r - m + 1) * size));
669               PetscCall(PetscTimSortMergeHi_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
670             }
671           }
672           /* Update A with merge */
673           stack[i - 2].size += stack[i - 1].size;
674           /* Push C up the stack */
675           stack[i - 1].start = stack[i].start;
676           stack[i - 1].size  = stack[i].size;
677         } else {
678         /* merge C into B */
679         mergeBC:
680           /* If B[-1] <= C[0] then... you know the drill */
681           if ((*cmp)(arr + (stack[i].start - 1) * size, arr + (stack[i].start) * size, ctx) > 0) {
682             m = stack[i].start;
683             /* Search B for C[0] insertion */
684             PetscCall(PetscGallopSearchLeft_Private(arr, size, cmp, ctx, stack[i - 1].start, stack[i].start - 1, arr + stack[i].start * size, &l));
685             /* Search C for B[-1] insertion */
686             PetscCall(PetscGallopSearchRight_Private(arr, size, cmp, ctx, stack[i].start, stack[i].start + stack[i].size - 1, (arr) + (stack[i].start - 1) * size, &r));
687             if (m - l <= r - m) {
688               PetscCall(PetscTimSortResizeBuffer_Private(buff, (m - l + 1) * size));
689               PetscCall(PetscTimSortMergeLo_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
690             } else {
691               PetscCall(PetscTimSortResizeBuffer_Private(buff, (r - m + 1) * size));
692               PetscCall(PetscTimSortMergeHi_Private(arr, buff->ptr, size, cmp, ctx, l, m, r));
693             }
694           }
695           /* Update B with merge */
696           stack[i - 1].size += stack[i].size;
697         }
698         --i;
699       } else if (stack[i - 1].size <= stack[i].size) {
700         /* merge C into B */
701         goto mergeBC;
702       }
703     }
704     if (itemp == i) break;
705   }
706   *stacksize = i;
707   PetscFunctionReturn(PETSC_SUCCESS);
708 }
709 
710 static inline PetscErrorCode PetscTimSortMergeCollapseWithArray_Private(char *arr, size_t asize, char *barr, size_t bsize, CompFunc cmp, void *ctx, PetscTimSortBuffer *abuff, PetscTimSortBuffer *bbuff, PetscTimSortStack *stack, PetscInt *stacksize)
711 {
712   PetscInt i = *stacksize;
713 
714   PetscFunctionBegin;
715   while (i) {
716     PetscInt l, m, r, itemp = i;
717 
718     if (i == 1) {
719       /* A = stack[i-1], B = stack[i] */
720       if (stack[i - 1].size < stack[i].size) {
721         /* if A[-1] <= B[0] then sorted */
722         if ((*cmp)(arr + (stack[i].start - 1) * asize, arr + (stack[i].start) * asize, ctx) > 0) {
723           m = stack[i].start;
724           /* Search A for B[0] insertion */
725           PetscCall(PetscGallopSearchLeft_Private(arr, asize, cmp, ctx, stack[i - 1].start, stack[i].start - 1, (arr) + stack[i].start * asize, &l));
726           /* Search B for A[-1] insertion */
727           PetscCall(PetscGallopSearchRight_Private(arr, asize, cmp, ctx, stack[i].start, stack[i].start + stack[i].size - 1, arr + (stack[i].start - 1) * asize, &r));
728           if (m - l <= r - m) {
729             PetscCall(PetscTimSortResizeBuffer_Private(abuff, (m - l + 1) * asize));
730             PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (m - l + 1) * bsize));
731             PetscCall(PetscTimSortMergeLoWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
732           } else {
733             PetscCall(PetscTimSortResizeBuffer_Private(abuff, (r - m + 1) * asize));
734             PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (r - m + 1) * bsize));
735             PetscCall(PetscTimSortMergeHiWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
736           }
737         }
738         /* Update A with merge */
739         stack[i - 1].size += stack[i].size;
740         --i;
741       }
742     } else {
743       /* i > 2, i.e. C exists
744        A = stack[i-2], B = stack[i-1], C = stack[i]; */
745       if (stack[i - 2].size <= stack[i - 1].size + stack[i].size) {
746         if (stack[i - 2].size < stack[i].size) {
747           /* merge B into A, but if A[-1] <= B[0] then already sorted */
748           if ((*cmp)(arr + (stack[i - 1].start - 1) * asize, arr + (stack[i - 1].start) * asize, ctx) > 0) {
749             m = stack[i - 1].start;
750             /* Search A for B[0] insertion */
751             PetscCall(PetscGallopSearchLeft_Private(arr, asize, cmp, ctx, stack[i - 2].start, stack[i - 1].start - 1, (arr) + (stack[i - 1].start) * asize, &l));
752             /* Search B for A[-1] insertion */
753             PetscCall(PetscGallopSearchRight_Private(arr, asize, cmp, ctx, stack[i - 1].start, stack[i - 1].start + stack[i - 1].size - 1, (arr) + (stack[i - 1].start - 1) * asize, &r));
754             if (m - l <= r - m) {
755               PetscCall(PetscTimSortResizeBuffer_Private(abuff, (m - l + 1) * asize));
756               PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (m - l + 1) * bsize));
757               PetscCall(PetscTimSortMergeLoWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
758             } else {
759               PetscCall(PetscTimSortResizeBuffer_Private(abuff, (r - m + 1) * asize));
760               PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (r - m + 1) * bsize));
761               PetscCall(PetscTimSortMergeHiWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
762             }
763           }
764           /* Update A with merge */
765           stack[i - 2].size += stack[i - 1].size;
766           /* Push C up the stack */
767           stack[i - 1].start = stack[i].start;
768           stack[i - 1].size  = stack[i].size;
769         } else {
770         /* merge C into B */
771         mergeBC:
772           /* If B[-1] <= C[0] then... you know the drill */
773           if ((*cmp)(arr + (stack[i].start - 1) * asize, arr + (stack[i].start) * asize, ctx) > 0) {
774             m = stack[i].start;
775             /* Search B for C[0] insertion */
776             PetscCall(PetscGallopSearchLeft_Private(arr, asize, cmp, ctx, stack[i - 1].start, stack[i].start - 1, arr + stack[i].start * asize, &l));
777             /* Search C for B[-1] insertion */
778             PetscCall(PetscGallopSearchRight_Private(arr, asize, cmp, ctx, stack[i].start, stack[i].start + stack[i].size - 1, (arr) + (stack[i].start - 1) * asize, &r));
779             if (m - l <= r - m) {
780               PetscCall(PetscTimSortResizeBuffer_Private(abuff, (m - l + 1) * asize));
781               PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (m - l + 1) * bsize));
782               PetscCall(PetscTimSortMergeLoWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
783             } else {
784               PetscCall(PetscTimSortResizeBuffer_Private(abuff, (r - m + 1) * asize));
785               PetscCall(PetscTimSortResizeBuffer_Private(bbuff, (r - m + 1) * bsize));
786               PetscCall(PetscTimSortMergeHiWithArray_Private(arr, abuff->ptr, asize, barr, bbuff->ptr, bsize, cmp, ctx, l, m, r));
787             }
788           }
789           /* Update B with merge */
790           stack[i - 1].size += stack[i].size;
791         }
792         --i;
793       } else if (stack[i - 1].size <= stack[i].size) {
794         /* merge C into B */
795         goto mergeBC;
796       }
797     }
798     if (itemp == i) break;
799   }
800   *stacksize = i;
801   PetscFunctionReturn(PETSC_SUCCESS);
802 }
803 
804 /* March sequentially through the array building up a "run" of weakly increasing or strictly decreasing contiguous
805  elements. Decreasing runs are reversed by swapping. If the run is less than minrun, artificially extend it via either
806  binary insertion sort or regulat insertion sort */
807 static inline PetscErrorCode PetscTimSortBuildRun_Private(char *arr, char *tarr, size_t size, CompFunc cmp, void *ctx, PetscInt n, PetscInt minrun, PetscInt runstart, PetscInt *runend)
808 {
809   const PetscInt re = PetscMin(runstart + minrun, n - 1);
810   PetscInt       ri = runstart;
811 
812   PetscFunctionBegin;
813   if (PetscUnlikely(runstart == n - 1)) {
814     *runend = runstart;
815     PetscFunctionReturn(PETSC_SUCCESS);
816   }
817   /* guess whether run is ascending or descending and tally up the longest consecutive run. essentially a coinflip for random data */
818   if ((*cmp)((arr) + (ri + 1) * size, (arr) + ri * size, ctx) < 0) {
819     ++ri;
820     while (ri < n - 1) {
821       if ((*cmp)((arr) + (ri + 1) * size, (arr) + ri * size, ctx) >= 0) break;
822       ++ri;
823     }
824     {
825       PetscInt lo = runstart, hi = ri;
826       for (; lo < hi; ++lo, --hi) COPYSWAPPY(arr + lo * size, arr + hi * size, tarr, size);
827     }
828   } else {
829     ++ri;
830     while (ri < n - 1) {
831       if ((*cmp)((arr) + (ri + 1) * size, (arr) + ri * size, ctx) < 0) break;
832       ++ri;
833     }
834   }
835   if (ri < re) {
836     /* the attempt failed, this section likely contains random data. If ri got close to minrun (within 50%) then we try
837      binary search */
838     if (ri - runstart <= minrun >> 1) {
839       ++MIN_GALLOP_GLOBAL; /* didn't get close hedge our bets against random data */
840       PetscCall(PetscInsertionSort_Private(arr, tarr, size, cmp, ctx, runstart, ri, re));
841     } else {
842       PetscCall(PetscBinaryInsertionSort_Private(arr, tarr, size, cmp, ctx, runstart, ri, re));
843     }
844     *runend = re;
845   } else *runend = ri;
846   PetscFunctionReturn(PETSC_SUCCESS);
847 }
848 
849 static inline PetscErrorCode PetscTimSortBuildRunWithArray_Private(char *arr, char *atarr, size_t asize, char *barr, char *btarr, size_t bsize, CompFunc cmp, void *ctx, PetscInt n, PetscInt minrun, PetscInt runstart, PetscInt *runend)
850 {
851   const PetscInt re = PetscMin(runstart + minrun, n - 1);
852   PetscInt       ri = runstart;
853 
854   PetscFunctionBegin;
855   if (PetscUnlikely(runstart == n - 1)) {
856     *runend = runstart;
857     PetscFunctionReturn(PETSC_SUCCESS);
858   }
859   /* guess whether run is ascending or descending and tally up the longest consecutive run. essentially a coinflip for random data */
860   if ((*cmp)((arr) + (ri + 1) * asize, arr + (ri * asize), ctx) < 0) {
861     ++ri;
862     while (ri < n - 1) {
863       if ((*cmp)((arr) + (ri + 1) * asize, (arr) + ri * asize, ctx) >= 0) break;
864       ++ri;
865     }
866     {
867       PetscInt lo = runstart, hi = ri;
868       for (; lo < hi; ++lo, --hi) COPYSWAPPY2(arr + lo * asize, arr + hi * asize, asize, barr + lo * bsize, barr + hi * bsize, bsize, atarr);
869     }
870   } else {
871     ++ri;
872     while (ri < n - 1) {
873       if ((*cmp)((arr) + (ri + 1) * asize, (arr) + ri * asize, ctx) < 0) break;
874       ++ri;
875     }
876   }
877   if (ri < re) {
878     /* the attempt failed, this section likely contains random data. If ri got close to minrun (within 50%) then we try
879      binary search */
880     if (ri - runstart <= minrun >> 1) {
881       ++MIN_GALLOP_GLOBAL; /* didn't get close hedge our bets against random data */
882       PetscCall(PetscInsertionSortWithArray_Private(arr, atarr, asize, barr, btarr, bsize, cmp, ctx, runstart, ri, re));
883     } else {
884       PetscCall(PetscBinaryInsertionSortWithArray_Private(arr, atarr, asize, barr, btarr, bsize, cmp, ctx, runstart, ri, re));
885     }
886     *runend = re;
887   } else *runend = ri;
888   PetscFunctionReturn(PETSC_SUCCESS);
889 }
890 
891 /*@C
892   PetscTimSort - Sorts an array in place in increasing order using Tim Peters adaptive sorting algorithm.
893 
894   Not Collective
895 
896   Input Parameters:
897 + n    - number of values
898 . arr  - array to be sorted
899 . size - size in bytes of the datatype held in arr
900 . cmp  - function pointer to comparison function
901 - ctx  - optional context to be passed to comparison function, NULL if not needed
902 
903   Output Parameter:
904 . arr  - sorted array
905 
906   Notes:
907   Timsort makes the assumption that input data is already likely partially ordered, or that it contains contiguous
908  sections (termed 'runs') where the data is locally ordered (but not necessarily globally ordered). It therefore aims to
909  select slices of the array in such a way that resulting mergesorts operate on near perfectly length-balanced arrays. To
910  do so it repeatedly triggers attempts throughout to merge adjacent runs.
911 
912   Should one run continuously "win" a comparison the algorithm begins the "gallop" phase. It will aggressively search
913   the "winner" for the location of the "losers" next entry (and vice versa) to copy all preceding elements into place in
914   bulk. However if the data is truly unordered (as is the case with random data) the immense gains possible from these
915   searches are expected __not__ to repay their costs. While adjacent arrays are almost all nearly the same size, they
916   likely all contain similar data.
917 
918   Sample usage:
919   The comparison function must follow the `qsort()` comparison function paradigm, returning the sign of the difference
920   between its arguments. If left < right : return -1, if left == right : return 0, if left > right : return 1. The user
921   may also
922  change or reverse the order of the sort by flipping the above. Note that stability of the sort is only guaranteed if
923  the comparison function forms a valid trigraph. For example when sorting an array of type "my_type" in increasing
924   order
925 
926 .vb
927   int my_increasing_comparison_function(const void *left, const void *right, void *ctx) {
928     my_type l = *(my_type *) left, r = *(my_type *) right;
929     return (l < r) ? -1 : (l > r);
930   }
931 .ve
932   Note the context is unused here but you may use it to pass and subsequently access whatever information required
933   inside the comparison function. The context pointer will unaltered except for any changes made inside the comparison function.
934   Then pass the function
935 .vb
936   PetscTimSort(n, arr, sizeof(arr[0]), my_increasing_comparison_function, ctx)
937 .ve
938 
939   Fortran Notes:
940   To use this from fortran you must write a comparison subroutine with 4 arguments which accepts left, right, ctx and
941   returns result. For example
942 .vb
943  subroutine CompareIntegers(left,right,ctx,result)
944    implicit none
945 
946    PetscInt,intent(in) :: left, right
947    type(UserCtx)       :: ctx
948    integer,intent(out) :: result
949 
950    if (left < right) then
951      result = -1
952    else if (left == right) then
953      result = 0
954    else
955      result = 1
956    end if
957    return
958  end subroutine CompareIntegers
959 .ve
960 
961   References:
962 . * - Tim Peters. https://bugs.python.org/file4451/timsort.txt
963 
964   Level: developer
965 
966 .seealso: `PetscTimSortWithArray()`, `PetscIntSortSemiOrdered()`, `PetscRealSortSemiOrdered()`, `PetscMPIIntSortSemiOrdered()`
967 @*/
968 PetscErrorCode PetscTimSort(PetscInt n, void *arr, size_t size, int (*cmp)(const void *, const void *, void *), void *ctx)
969 {
970   PetscInt           stacksize = 0, minrun, runstart = 0, runend = 0;
971   PetscTimSortStack  runstack[128];
972   PetscTimSortBuffer buff;
973   /* stacksize  = log_phi(n) = log_2(n)/log_2(phi), so 128 is enough for ~5.614e26 elements.
974    It is so unlikely that this limit is reached that this is __never__ checked for */
975 
976   PetscFunctionBegin;
977   /* Compute minrun. Minrun should be (32, 65) such that N/minrun
978    is a power of 2 or one plus a power of 2 */
979   {
980     PetscInt t = n, r = 0;
981     /* r becomes 1 if the least significant bits contain at least one off bit */
982     while (t >= 64) {
983       r |= t & 1;
984       t >>= 1;
985     }
986     minrun = t + r;
987   }
988   if (PetscDefined(USE_DEBUG)) {
989     PetscCall(PetscInfo(NULL, "minrun = %" PetscInt_FMT "\n", minrun));
990     if (n < 64) {
991       PetscCall(PetscInfo(NULL, "n %" PetscInt_FMT " < 64, consider using PetscSortInt() instead\n", n));
992     } else PetscCheck(!(minrun < 32) && !(minrun > 65), PETSC_COMM_SELF, PETSC_ERR_PLIB, "Calculated minrun %" PetscInt_FMT " not in range (32,65)", minrun);
993   }
994   PetscCall(PetscMalloc1((size_t)minrun * size, &buff.ptr));
995   buff.size         = (size_t)minrun * size;
996   buff.maxsize      = (size_t)n * size;
997   MIN_GALLOP_GLOBAL = MIN_GALLOP_CONST_GLOBAL;
998   while (runstart < n) {
999     /* Check if additional entries are at least partially ordered and build natural run */
1000     PetscCall(PetscTimSortBuildRun_Private((char *)arr, buff.ptr, size, cmp, ctx, n, minrun, runstart, &runend));
1001     runstack[stacksize].start = runstart;
1002     runstack[stacksize].size  = runend - runstart + 1;
1003     PetscCall(PetscTimSortMergeCollapse_Private((char *)arr, size, cmp, ctx, &buff, runstack, &stacksize));
1004     ++stacksize;
1005     runstart = runend + 1;
1006   }
1007   /* Have been inside while, so discard last stacksize++ */
1008   --stacksize;
1009   PetscCall(PetscTimSortForceCollapse_Private((char *)arr, size, cmp, ctx, &buff, runstack, stacksize));
1010   PetscCall(PetscFree(buff.ptr));
1011   MIN_GALLOP_GLOBAL = MIN_GALLOP_CONST_GLOBAL;
1012   PetscFunctionReturn(PETSC_SUCCESS);
1013 }
1014 
1015 /*@C
1016   PetscTimSortWithArray - Sorts an array in place in increasing order using Tim Peters adaptive sorting algorithm and
1017   reorders a second array to match the first. The arrays need not be the same type.
1018 
1019   Not Collective
1020 
1021   Input Parameters:
1022 + n     - number of values
1023 . asize - size in bytes of the datatype held in arr
1024 . bsize - size in bytes of the datatype held in barr
1025 . cmp   - function pointer to comparison function
1026 - ctx   - optional context to be passed to comparison function, NULL if not needed
1027 
1028   Input/Output Parameters:
1029 + arr  - array to be sorted, on output it is sorted
1030 - barr - array to be reordered, on output it is reordered
1031 
1032   Notes:
1033   The arrays need not be of the same type, however barr MUST contain at least as many elements as arr and the two CANNOT
1034   overlap.
1035 
1036   Timsort makes the assumption that input data is already likely partially ordered, or that it contains contiguous
1037   sections (termed 'runs') where the data is locally ordered (but not necessarily globally ordered). It therefore aims
1038  to select slices of the array in such a way that resulting mergesorts operate on near perfectly length-balanced
1039  arrays. To do so it repeatedly triggers attempts throughout to merge adjacent runs.
1040 
1041   Should one run continuously "win" a comparison the algorithm begins the "gallop" phase. It will aggressively search
1042   the "winner" for the location of the "losers" next entry (and vice versa) to copy all preceding elements into place in
1043   bulk. However if the data is truly unordered (as is the case with random data) the immense gains possible from these
1044   searches are expected __not__ to repay their costs. While adjacent arrays are almost all nearly the same size, they
1045   likely all contain similar data.
1046 
1047   Sample usage:
1048   The comparison function must follow the `qsort()` comparison function paradigm, returning the sign of the difference
1049   between its arguments. If left < right : return -1, if left == right : return 0, if left > right : return 1. The user
1050   may also change or reverse the order of the sort by flipping the above. Note that stability of the sort is only
1051   guaranteed if the comparison function forms a valid trigraph. For example when sorting an array of type "my_type" in
1052   increasing order
1053 
1054 .vb
1055   int my_increasing_comparison_function(const void *left, const void *right, void *ctx) {
1056     my_type l = *(my_type *) left, r = *(my_type *) right;
1057     return (l < r) ? -1 : (l > r);
1058   }
1059 .ve
1060   Note the context is unused here but you may use it to pass and subsequently access whatever information required
1061   inside the comparison function. The context pointer will unaltered except for any changes made inside the comparison function.
1062   Then pass the function
1063 .vb
1064   PetscTimSortWithArray(n, arr, sizeof(arr[0]), barr, sizeof(barr[0]), my_increasing_comparison_function, ctx)
1065 .ve
1066 
1067   Fortran Notes:
1068   To use this from fortran you must write a comparison subroutine with 4 arguments which accepts left, right, ctx and
1069   returns result. For example
1070 .vb
1071  subroutine CompareIntegers(left,right,ctx,result)
1072    implicit none
1073 
1074    PetscInt,intent(in) :: left, right
1075    type(UserCtx)       :: ctx
1076    integer,intent(out) :: result
1077 
1078    if (left < right) then
1079      result = -1
1080    else if (left == right) then
1081      result = 0
1082    else
1083      result = 1
1084    end if
1085    return
1086  end subroutine CompareIntegers
1087 .ve
1088 
1089   References:
1090 . * - Tim Peters. https://bugs.python.org/file4451/timsort.txt
1091 
1092   Level: developer
1093 
1094 .seealso: `PetscTimSort()`, `PetscIntSortSemiOrderedWithArray()`, `PetscRealSortSemiOrderedWithArrayInt()`, `PetscMPIIntSortSemiOrderedWithArray()`
1095 @*/
1096 PetscErrorCode PetscTimSortWithArray(PetscInt n, void *arr, size_t asize, void *barr, size_t bsize, int (*cmp)(const void *, const void *, void *), void *ctx)
1097 {
1098   PetscInt           stacksize = 0, minrun, runstart = 0, runend = 0;
1099   PetscTimSortStack  runstack[128];
1100   PetscTimSortBuffer abuff, bbuff;
1101   /* stacksize  = log_phi(n) = log_2(n)/log_2(phi), so 128 is enough for ~5.614e26 elements.
1102    It is so unlikely that this limit is reached that this is __never__ checked for */
1103 
1104   PetscFunctionBegin;
1105   /* Compute minrun. Minrun should be (32, 65) such that N/minrun
1106    is a power of 2 or one plus a power of 2 */
1107   {
1108     PetscInt t = n, r = 0;
1109     /* r becomes 1 if the least significant bits contain at least one off bit */
1110     while (t >= 64) {
1111       r |= t & 1;
1112       t >>= 1;
1113     }
1114     minrun = t + r;
1115   }
1116   if (PetscDefined(USE_DEBUG)) {
1117     PetscCall(PetscInfo(NULL, "minrun = %" PetscInt_FMT "\n", minrun));
1118     PetscCheck(n < 64 || (minrun >= 32 && minrun <= 65), PETSC_COMM_SELF, PETSC_ERR_PLIB, "Calculated minrun %" PetscInt_FMT " not in range (32,65)", minrun);
1119   }
1120   PetscCall(PetscMalloc1((size_t)minrun * asize, &abuff.ptr));
1121   abuff.size    = (size_t)minrun * asize;
1122   abuff.maxsize = (size_t)n * asize;
1123   PetscCall(PetscMalloc1((size_t)minrun * bsize, &bbuff.ptr));
1124   bbuff.size        = (size_t)minrun * bsize;
1125   bbuff.maxsize     = (size_t)n * bsize;
1126   MIN_GALLOP_GLOBAL = MIN_GALLOP_CONST_GLOBAL;
1127   while (runstart < n) {
1128     /* Check if additional entries are at least partially ordered and build natural run */
1129     PetscCall(PetscTimSortBuildRunWithArray_Private((char *)arr, abuff.ptr, asize, (char *)barr, bbuff.ptr, bsize, cmp, ctx, n, minrun, runstart, &runend));
1130     runstack[stacksize].start = runstart;
1131     runstack[stacksize].size  = runend - runstart + 1;
1132     PetscCall(PetscTimSortMergeCollapseWithArray_Private((char *)arr, asize, (char *)barr, bsize, cmp, ctx, &abuff, &bbuff, runstack, &stacksize));
1133     ++stacksize;
1134     runstart = runend + 1;
1135   }
1136   /* Have been inside while, so discard last stacksize++ */
1137   --stacksize;
1138   PetscCall(PetscTimSortForceCollapseWithArray_Private((char *)arr, asize, (char *)barr, bsize, cmp, ctx, &abuff, &bbuff, runstack, stacksize));
1139   PetscCall(PetscFree(abuff.ptr));
1140   PetscCall(PetscFree(bbuff.ptr));
1141   MIN_GALLOP_GLOBAL = MIN_GALLOP_CONST_GLOBAL;
1142   PetscFunctionReturn(PETSC_SUCCESS);
1143 }
1144 
1145 /*@
1146    PetscIntSortSemiOrdered - Sorts an array of `PetscInt` in place in increasing order.
1147 
1148    Not Collective
1149 
1150    Input Parameters:
1151 +  n   - number of values
1152 -  arr - array of integers
1153 
1154    Output Parameter:
1155 .  arr - sorted array of integers
1156 
1157    Notes:
1158    If the array is less than 64 entries long `PetscSortInt()` is automatically used.
1159 
1160    This function serves as an alternative to `PetscSortInt()`. While this function works for any array of integers it is
1161    significantly faster if the array is not totally random. There are exceptions to this and so it is __highly__
1162    recommended that the user benchmark their code to see which routine is fastest.
1163 
1164    Level: intermediate
1165 
1166 .seealso: `PetscTimSort()`, `PetscSortInt()`, `PetscSortIntWithPermutation()`
1167 @*/
1168 PetscErrorCode PetscIntSortSemiOrdered(PetscInt n, PetscInt arr[])
1169 {
1170   PetscFunctionBegin;
1171   if (n <= 1) PetscFunctionReturn(PETSC_SUCCESS);
1172   PetscValidIntPointer(arr, 2);
1173   if (n < 64) {
1174     PetscCall(PetscSortInt(n, arr));
1175   } else {
1176     PetscCall(PetscTimSort(n, arr, sizeof(PetscInt), Compare_PetscInt_Private, NULL));
1177   }
1178   PetscFunctionReturn(PETSC_SUCCESS);
1179 }
1180 
1181 /*@
1182    PetscIntSortSemiOrderedWithArray - Sorts an array of `PetscInt` in place in increasing order and reorders a second
1183    `PetscInt` array to match the first.
1184 
1185    Not Collective
1186 
1187    Input Parameter:
1188 .  n   - number of values
1189 
1190    Input/Output Parameters:
1191 +  arr1 - array of integers to be sorted, modified on output
1192 -  arr2 - array of integers to be reordered, modified on output
1193 
1194    Notes:
1195    The arrays CANNOT overlap.
1196 
1197    This function serves as an alternative to PetscSortIntWithArray(). While this function works for any array of integers it is
1198    significantly faster if the array is not totally random. There are exceptions to this and so it is __highly__
1199    recommended that the user benchmark their code to see which routine is fastest.
1200 
1201    Level: intermediate
1202 
1203 .seealso: `PetscTimSortWithArray()`, `PetscSortIntWithArray()`, `PetscSortIntWithPermutation()`
1204 @*/
1205 PetscErrorCode PetscIntSortSemiOrderedWithArray(PetscInt n, PetscInt arr1[], PetscInt arr2[])
1206 {
1207   PetscFunctionBegin;
1208   if (n <= 1) PetscFunctionReturn(PETSC_SUCCESS);
1209   PetscValidIntPointer(arr1, 2);
1210   PetscValidIntPointer(arr2, 3);
1211   /* cannot export out to PetscIntSortWithArray here since it isn't stable */
1212   PetscCall(PetscTimSortWithArray(n, arr1, sizeof(PetscInt), arr2, sizeof(PetscInt), Compare_PetscInt_Private, NULL));
1213   PetscFunctionReturn(PETSC_SUCCESS);
1214 }
1215 
1216 /*@
1217    PetscMPIIntSortSemiOrdered - Sorts an array of `PetscMPIInt` in place in increasing order.
1218 
1219    Not Collective
1220 
1221    Input Parameters:
1222 +  n   - number of values
1223 -  arr - array of `PetscMPIInt`
1224 
1225    Output Parameter:
1226 .  arr - sorted array of integers
1227 
1228    Notes:
1229    If the array is less than 64 entries long `PetscSortMPIInt()` is automatically used.
1230 
1231    This function serves as an alternative to `PetscSortMPIInt()`. While this function works for any array of `PetscMPIInt` it is
1232    significantly faster if the array is not totally random. There are exceptions to this and so it is __highly__
1233    recommended that the user benchmark their code to see which routine is fastest.
1234 
1235    Level: intermediate
1236 
1237 .seealso: `PetscTimSort()`, `PetscSortMPIInt()`
1238 @*/
1239 PetscErrorCode PetscMPIIntSortSemiOrdered(PetscInt n, PetscMPIInt arr[])
1240 {
1241   PetscFunctionBegin;
1242   if (n <= 1) PetscFunctionReturn(PETSC_SUCCESS);
1243   PetscValidIntPointer(arr, 2);
1244   if (n < 64) {
1245     PetscCall(PetscSortMPIInt(n, arr));
1246   } else {
1247     PetscCall(PetscTimSort(n, arr, sizeof(PetscMPIInt), Compare_PetscMPIInt_Private, NULL));
1248   }
1249   PetscFunctionReturn(PETSC_SUCCESS);
1250 }
1251 
1252 /*@
1253    PetscMPIIntSortSemiOrderedWithArray - Sorts an array of `PetscMPIInt` in place in increasing order and reorders a second `PetscMPIInt`
1254    array to match the first.
1255 
1256    Not Collective
1257 
1258    Input Parameter:
1259 .  n   - number of values
1260 
1261    Input/Output Parameters:
1262 .  arr1 - array of integers to be sorted, modified on output
1263 -  arr2 - array of integers to be reordered, modified on output
1264 
1265    Notes:
1266    The arrays CANNOT overlap.
1267 
1268    This function serves as an alternative to `PetscSortMPIIntWithArray()`. While this function works for any array of integers it is
1269    significantly faster if the array is not totally random. There are exceptions to this and so it is __highly__
1270    recommended that the user benchmark their code to see which routine is fastest.
1271 
1272    Level: intermediate
1273 
1274 .seealso: `PetscTimSortWithArray()`, `PetscSortMPIIntWithArray()`, `PetscSortMPIIntWithPermutation()`
1275 @*/
1276 PetscErrorCode PetscMPIIntSortSemiOrderedWithArray(PetscInt n, PetscMPIInt arr1[], PetscMPIInt arr2[])
1277 {
1278   PetscFunctionBegin;
1279   if (n <= 1) PetscFunctionReturn(PETSC_SUCCESS);
1280   PetscValidIntPointer(arr1, 2);
1281   PetscValidIntPointer(arr2, 3);
1282   /* cannot export out to PetscMPIIntSortWithArray here since it isn't stable */
1283   PetscCall(PetscTimSortWithArray(n, arr1, sizeof(PetscMPIInt), arr2, sizeof(PetscMPIInt), Compare_PetscMPIInt_Private, NULL));
1284   PetscFunctionReturn(PETSC_SUCCESS);
1285 }
1286 
1287 /*@
1288    PetscRealSortSemiOrdered - Sorts an array of `PetscReal` in place in increasing order.
1289 
1290    Not Collective
1291 
1292    Input Parameters:
1293 +  n   - number of values
1294 -  arr - array of `PetscReal`
1295 
1296    Output Parameter:
1297 .  arr - sorted array of integers
1298 
1299    Notes:
1300    If the array is less than 64 entries long `PetscSortReal()` is automatically used.
1301 
1302    This function serves as an alternative to `PetscSortReal()`. While this function works for any array of `PetscReal` it is
1303    significantly faster if the array is not totally random. There are exceptions to this and so it is __highly__
1304    recommended that the user benchmark their code to see which routine is fastest.
1305 
1306    Level: intermediate
1307 
1308 .seealso: `PetscTimSort()`, `PetscSortReal()`, `PetscSortRealWithPermutation()`
1309 @*/
1310 PetscErrorCode PetscRealSortSemiOrdered(PetscInt n, PetscReal arr[])
1311 {
1312   PetscFunctionBegin;
1313   if (n <= 1) PetscFunctionReturn(PETSC_SUCCESS);
1314   PetscValidRealPointer(arr, 2);
1315   if (n < 64) {
1316     PetscCall(PetscSortReal(n, arr));
1317   } else {
1318     PetscCall(PetscTimSort(n, arr, sizeof(PetscReal), Compare_PetscReal_Private, NULL));
1319   }
1320   PetscFunctionReturn(PETSC_SUCCESS);
1321 }
1322 
1323 /*@
1324    PetscRealSortSemiOrderedWithArrayInt - Sorts an array of `PetscReal` in place in increasing order and reorders a second
1325    array of `PetscInt` to match the first.
1326 
1327    Not Collective
1328 
1329    Input Parameter:
1330 .  n   - number of values
1331 
1332    Input/Output Parameters:
1333 .  arr1 - array of `PetscReal` to be sorted, modified on output
1334 -  arr2 - array of `PetscInt` to be reordered, modified on output
1335 
1336    Notes:
1337    This function serves as an alternative to `PetscSortRealWithArray()`. While this function works for any array of `PetscReal` it is
1338    significantly faster if the array is not totally random. There are exceptions to this and so it is __highly__
1339    recommended that the user benchmark their code to see which routine is fastest.
1340 
1341    Level: intermediate
1342 
1343 .seealso: `PetscTimSortWithArray()`, `PetscSortRealWithArrayInt()`, `PetscSortRealWithPermutation()`
1344 @*/
1345 PetscErrorCode PetscRealSortSemiOrderedWithArrayInt(PetscInt n, PetscReal arr1[], PetscInt arr2[])
1346 {
1347   PetscFunctionBegin;
1348   if (n <= 1) PetscFunctionReturn(PETSC_SUCCESS);
1349   PetscValidRealPointer(arr1, 2);
1350   PetscValidIntPointer(arr2, 3);
1351   /* cannot export out to PetscRealSortWithArrayInt here since it isn't stable */
1352   PetscCall(PetscTimSortWithArray(n, arr1, sizeof(PetscReal), arr2, sizeof(PetscInt), Compare_PetscReal_Private, NULL));
1353   PetscFunctionReturn(PETSC_SUCCESS);
1354 }
1355