xref: /petsc/src/mat/graphops/coarsen/impls/hem/hem.c (revision 5a67bb2bab7ce296578be4e8d1213f21203fd3df)
1 #include <petsc/private/matimpl.h> /*I "petscmat.h" I*/
2 #include <../src/mat/impls/aij/seq/aij.h>
3 #include <../src/mat/impls/aij/mpi/mpiaij.h>
4 
5 /* linked list methods
6  *
7  *  PetscCDCreate
8  */
9 PetscErrorCode PetscCDCreate(PetscInt a_size, PetscCoarsenData **a_out)
10 {
11   PetscCoarsenData *ail;
12 
13   PetscFunctionBegin;
14   /* allocate pool, partially */
15   PetscCall(PetscNew(&ail));
16   *a_out               = ail;
17   ail->pool_list.next  = NULL;
18   ail->pool_list.array = NULL;
19   ail->chk_sz          = 0;
20   /* allocate array */
21   ail->size = a_size;
22   PetscCall(PetscCalloc1(a_size, &ail->array));
23   ail->extra_nodes = NULL;
24   ail->mat         = NULL;
25   PetscFunctionReturn(PETSC_SUCCESS);
26 }
27 
28 /* PetscCDDestroy
29  */
30 PetscErrorCode PetscCDDestroy(PetscCoarsenData *ail)
31 {
32   PetscCDArrNd *n = &ail->pool_list;
33 
34   PetscFunctionBegin;
35   n = n->next;
36   while (n) {
37     PetscCDArrNd *lstn = n;
38     n                  = n->next;
39     PetscCall(PetscFree(lstn));
40   }
41   if (ail->pool_list.array) PetscCall(PetscFree(ail->pool_list.array));
42   PetscCall(PetscFree(ail->array));
43   if (ail->mat) PetscCall(MatDestroy(&ail->mat));
44   /* delete this (+agg+pool array) */
45   PetscCall(PetscFree(ail));
46   PetscFunctionReturn(PETSC_SUCCESS);
47 }
48 
49 /* PetscCDSetChunkSize
50  */
PetscErrorCode PetscCDSetChunkSize(PetscCoarsenData *ail, PetscInt a_sz)
{
  PetscFunctionBegin;
  /* number of nodes allocated per pool chunk by PetscCDGetNewNode(); a value of 0
     lets PetscCDGetNewNode() pick its default on first use */
  ail->chk_sz = a_sz;
  PetscFunctionReturn(PETSC_SUCCESS);
}
57 
58 /*  PetscCDGetNewNode
59  */
static PetscErrorCode PetscCDGetNewNode(PetscCoarsenData *ail, PetscCDIntNd **a_out, PetscInt a_id)
{
  PetscFunctionBegin;
  *a_out = NULL; /* squelch -Wmaybe-uninitialized */
  if (ail->extra_nodes) {
    /* reuse a node from the free-node cache (populated by PetscCDRemoveAllAt) */
    PetscCDIntNd *node = ail->extra_nodes;
    ail->extra_nodes   = node->next;
    node->gid          = a_id;
    node->next         = NULL;
    *a_out             = node;
  } else {
    if (!ail->pool_list.array) {
      /* first request ever: allocate the initial pool chunk */
      if (!ail->chk_sz) ail->chk_sz = 10; /* use a chuck size of ail->size? */
      PetscCall(PetscMalloc1(ail->chk_sz, &ail->pool_list.array));
      ail->new_node       = ail->pool_list.array;
      ail->new_left       = ail->chk_sz;
      ail->new_node->next = NULL;
    } else if (!ail->new_left) {
      /* current chunk exhausted: allocate header + chk_sz nodes as ONE block and
         push it on the chunk list so PetscCDDestroy can free it with one PetscFree */
      PetscCDArrNd *node;
      PetscCall(PetscMalloc(ail->chk_sz * sizeof(PetscCDIntNd) + sizeof(PetscCDArrNd), &node));
      node->array         = (PetscCDIntNd *)(node + 1); /* nodes live immediately after the header */
      node->next          = ail->pool_list.next;
      ail->pool_list.next = node;
      ail->new_left       = ail->chk_sz;
      ail->new_node       = node->array;
    }
    /* hand out the next unused node of the current chunk */
    ail->new_node->gid  = a_id;
    ail->new_node->next = NULL;
    *a_out              = ail->new_node++;
    ail->new_left--;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
93 
94 /* PetscCDIntNdSetID
95  */
PetscErrorCode PetscCDIntNdSetID(PetscCDIntNd *a_this, PetscInt a_id)
{
  PetscFunctionBegin;
  /* store the global id carried by this list node */
  a_this->gid = a_id;
  PetscFunctionReturn(PETSC_SUCCESS);
}
102 
103 /* PetscCDIntNdGetID
104  */
PetscErrorCode PetscCDIntNdGetID(const PetscCDIntNd *a_this, PetscInt *a_gid)
{
  PetscFunctionBegin;
  /* return the global id carried by this list node */
  *a_gid = a_this->gid;
  PetscFunctionReturn(PETSC_SUCCESS);
}
111 
112 /* PetscCDGetHeadPos
113  */
PetscErrorCode PetscCDGetHeadPos(const PetscCoarsenData *ail, PetscInt a_idx, PetscCDIntNd **pos)
{
  PetscFunctionBegin;
  PetscCheck(a_idx < ail->size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "a_idx >= ail->size: a_idx=%" PetscInt_FMT ".", a_idx);
  /* head node of list a_idx; NULL when the list is empty */
  *pos = ail->array[a_idx];
  PetscFunctionReturn(PETSC_SUCCESS);
}
121 
122 /* PetscCDGetNextPos
123  */
PetscErrorCode PetscCDGetNextPos(const PetscCoarsenData *ail, PetscInt l_idx, PetscCDIntNd **pos)
{
  /* advance *pos to the next node; ail and l_idx are unused here (kept for API symmetry) */
  PetscFunctionBegin;
  PetscCheck((*pos), PETSC_COMM_SELF, PETSC_ERR_PLIB, "NULL input position.");
  *pos = (*pos)->next; /* becomes NULL at end of list */
  PetscFunctionReturn(PETSC_SUCCESS);
}
131 
132 /* PetscCDAppendID
133  */
134 PetscErrorCode PetscCDAppendID(PetscCoarsenData *ail, PetscInt a_idx, PetscInt a_id)
135 {
136   PetscCDIntNd *n, *n2;
137 
138   PetscFunctionBegin;
139   PetscCall(PetscCDGetNewNode(ail, &n, a_id));
140   PetscCheck(a_idx < ail->size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Index %" PetscInt_FMT " out of range.", a_idx);
141   if (!(n2 = ail->array[a_idx])) ail->array[a_idx] = n;
142   else {
143     do {
144       if (!n2->next) {
145         n2->next = n;
146         PetscCheck(!n->next, PETSC_COMM_SELF, PETSC_ERR_PLIB, "n should not have a next");
147         break;
148       }
149       n2 = n2->next;
150     } while (n2);
151     PetscCheck(n2, PETSC_COMM_SELF, PETSC_ERR_PLIB, "n2 should be non-null");
152   }
153   PetscFunctionReturn(PETSC_SUCCESS);
154 }
155 
156 /* PetscCDAppendNode
157  */
158 PetscErrorCode PetscCDAppendNode(PetscCoarsenData *ail, PetscInt a_idx, PetscCDIntNd *a_n)
159 {
160   PetscCDIntNd *n2;
161 
162   PetscFunctionBegin;
163   PetscCheck(a_idx < ail->size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Index %" PetscInt_FMT " out of range.", a_idx);
164   if (!(n2 = ail->array[a_idx])) ail->array[a_idx] = a_n;
165   else {
166     do {
167       if (!n2->next) {
168         n2->next  = a_n;
169         a_n->next = NULL;
170         break;
171       }
172       n2 = n2->next;
173     } while (n2);
174     PetscCheck(n2, PETSC_COMM_SELF, PETSC_ERR_PLIB, "n2 should be non-null");
175   }
176   PetscFunctionReturn(PETSC_SUCCESS);
177 }
178 
179 /* PetscCDRemoveNextNode: a_last->next, this exposes single linked list structure to API (not used)
180  */
181 PetscErrorCode PetscCDRemoveNextNode(PetscCoarsenData *ail, PetscInt a_idx, PetscCDIntNd *a_last)
182 {
183   PetscCDIntNd *del;
184 
185   PetscFunctionBegin;
186   PetscCheck(a_idx < ail->size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Index %" PetscInt_FMT " out of range.", a_idx);
187   PetscCheck(a_last->next, PETSC_COMM_SELF, PETSC_ERR_PLIB, "a_last should have a next");
188   del          = a_last->next;
189   a_last->next = del->next;
190   /* del->next = NULL; -- this still used in a iterator so keep it intact -- need to fix this with a double linked list */
191   /* could reuse n2 but PetscCDAppendNode sometimes uses it */
192   PetscFunctionReturn(PETSC_SUCCESS);
193 }
194 
195 /* PetscCDPrint
196  */
197 PetscErrorCode PetscCDPrint(const PetscCoarsenData *ail, PetscInt my0, MPI_Comm comm)
198 {
199   PetscCDIntNd *n, *n2;
200   PetscInt      ii;
201 
202   PetscFunctionBegin;
203   for (ii = 0; ii < ail->size; ii++) {
204     n2 = n = ail->array[ii];
205     if (n) PetscCall(PetscSynchronizedPrintf(comm, "list %" PetscInt_FMT ":", ii + my0));
206     while (n) {
207       PetscCall(PetscSynchronizedPrintf(comm, " %" PetscInt_FMT, n->gid));
208       n = n->next;
209     }
210     if (n2) PetscCall(PetscSynchronizedPrintf(comm, "\n"));
211   }
212   PetscCall(PetscSynchronizedFlush(comm, PETSC_STDOUT));
213   PetscFunctionReturn(PETSC_SUCCESS);
214 }
215 
216 /* PetscCDMoveAppend - take list in a_srcidx and appends to destidx
217  */
218 PetscErrorCode PetscCDMoveAppend(PetscCoarsenData *ail, PetscInt a_destidx, PetscInt a_srcidx)
219 {
220   PetscCDIntNd *n;
221 
222   PetscFunctionBegin;
223   PetscCheck(a_srcidx < ail->size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Index %" PetscInt_FMT " out of range.", a_srcidx);
224   PetscCheck(a_destidx < ail->size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Index %" PetscInt_FMT " out of range.", a_destidx);
225   PetscCheck(a_destidx != a_srcidx, PETSC_COMM_SELF, PETSC_ERR_PLIB, "a_destidx==a_srcidx %" PetscInt_FMT ".", a_destidx);
226   n = ail->array[a_destidx];
227   if (!n) ail->array[a_destidx] = ail->array[a_srcidx];
228   else {
229     do {
230       if (!n->next) {
231         n->next = ail->array[a_srcidx]; // append
232         break;
233       }
234       n = n->next;
235     } while (1);
236   }
237   ail->array[a_srcidx] = NULL; // empty
238   PetscFunctionReturn(PETSC_SUCCESS);
239 }
240 
241 /* PetscCDRemoveAllAt - empty one list and move data to cache
242  */
243 PetscErrorCode PetscCDRemoveAllAt(PetscCoarsenData *ail, PetscInt a_idx)
244 {
245   PetscCDIntNd *rem, *n1;
246 
247   PetscFunctionBegin;
248   PetscCheck(a_idx < ail->size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Index %" PetscInt_FMT " out of range.", a_idx);
249   rem               = ail->array[a_idx];
250   ail->array[a_idx] = NULL;
251   if (!(n1 = ail->extra_nodes)) ail->extra_nodes = rem;
252   else {
253     while (n1->next) n1 = n1->next;
254     n1->next = rem;
255   }
256   PetscFunctionReturn(PETSC_SUCCESS);
257 }
258 
259 /* PetscCDCountAt
260  */
261 PetscErrorCode PetscCDCountAt(const PetscCoarsenData *ail, PetscInt a_idx, PetscInt *a_sz)
262 {
263   PetscCDIntNd *n1;
264   PetscInt      sz = 0;
265 
266   PetscFunctionBegin;
267   PetscCheck(a_idx < ail->size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Index %" PetscInt_FMT " out of range.", a_idx);
268   n1 = ail->array[a_idx];
269   while (n1) {
270     n1 = n1->next;
271     sz++;
272   }
273   *a_sz = sz;
274   PetscFunctionReturn(PETSC_SUCCESS);
275 }
276 
277 /* PetscCDSize
278  */
279 PetscErrorCode PetscCDCount(const PetscCoarsenData *ail, PetscInt *a_sz)
280 {
281   PetscInt sz = 0;
282 
283   PetscFunctionBegin;
284   for (int ii = 0; ii < ail->size; ii++) {
285     PetscCDIntNd *n1 = ail->array[ii];
286     while (n1) {
287       n1 = n1->next;
288       sz++;
289     }
290   }
291   *a_sz = sz;
292   PetscFunctionReturn(PETSC_SUCCESS);
293 }
294 
295 /* PetscCDIsEmptyAt - Is the list empty? (not used)
296  */
PetscErrorCode PetscCDIsEmptyAt(const PetscCoarsenData *ail, PetscInt a_idx, PetscBool *a_e)
{
  PetscFunctionBegin;
  PetscCheck(a_idx < ail->size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Index %" PetscInt_FMT " out of range.", a_idx);
  /* PETSC_TRUE iff list a_idx has no nodes */
  *a_e = (PetscBool)(ail->array[a_idx] == NULL);
  PetscFunctionReturn(PETSC_SUCCESS);
}
304 
305 /* PetscCDGetNonemptyIS - used for C-F methods
306  */
307 PetscErrorCode PetscCDGetNonemptyIS(PetscCoarsenData *ail, IS *a_mis)
308 {
309   PetscCDIntNd *n;
310   PetscInt      ii, kk;
311   PetscInt     *permute;
312 
313   PetscFunctionBegin;
314   for (ii = kk = 0; ii < ail->size; ii++) {
315     n = ail->array[ii];
316     if (n) kk++;
317   }
318   PetscCall(PetscMalloc1(kk, &permute));
319   for (ii = kk = 0; ii < ail->size; ii++) {
320     n = ail->array[ii];
321     if (n) permute[kk++] = ii;
322   }
323   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, kk, permute, PETSC_OWN_POINTER, a_mis));
324   PetscFunctionReturn(PETSC_SUCCESS);
325 }
326 
327 /* PetscCDGetMat
328  */
329 PetscErrorCode PetscCDGetMat(PetscCoarsenData *ail, Mat *a_mat)
330 {
331   PetscFunctionBegin;
332   *a_mat   = ail->mat;
333   ail->mat = NULL; // give it up
334   PetscFunctionReturn(PETSC_SUCCESS);
335 }
336 
337 /* PetscCDSetMat
338  */
339 PetscErrorCode PetscCDSetMat(PetscCoarsenData *ail, Mat a_mat)
340 {
341   PetscFunctionBegin;
342   if (ail->mat) {
343     PetscCall(MatDestroy(&ail->mat)); //should not happen
344   }
345   ail->mat = a_mat;
346   PetscFunctionReturn(PETSC_SUCCESS);
347 }
348 
349 /* PetscCDGetASMBlocks - get IS of aggregates for ASM smoothers
350  */
PetscErrorCode PetscCDGetASMBlocks(const PetscCoarsenData *ail, const PetscInt a_bs, PetscInt *a_sz, IS **a_local_is)
{
  PetscCDIntNd *n;
  PetscInt      lsz, ii, kk, *idxs, jj, gid;
  IS           *is_loc = NULL;

  PetscFunctionBegin;
  /* first pass: count the nonempty lists -- one IS is created per aggregate */
  for (ii = kk = 0; ii < ail->size; ii++) {
    if (ail->array[ii]) kk++;
  }
  *a_sz = kk;
  PetscCall(PetscMalloc1(kk, &is_loc));
  for (ii = kk = 0; ii < ail->size; ii++) {
    /* lsz = number of nodes in list ii */
    for (lsz = 0, n = ail->array[ii]; n; lsz++, n = n->next) /* void */
      ;
    if (lsz) {
      /* expand each gid in the aggregate into its a_bs block dof indices;
         lsz is reused as the running write position into idxs */
      PetscCall(PetscMalloc1(a_bs * lsz, &idxs));
      for (lsz = 0, n = ail->array[ii]; n; n = n->next) {
        PetscCall(PetscCDIntNdGetID(n, &gid));
        for (jj = 0; jj < a_bs; lsz++, jj++) idxs[lsz] = a_bs * gid + jj;
      }
      /* the IS takes ownership of idxs */
      PetscCall(ISCreateGeneral(PETSC_COMM_SELF, lsz, idxs, PETSC_OWN_POINTER, &is_loc[kk++]));
    }
  }
  PetscCheck(*a_sz == kk, PETSC_COMM_SELF, PETSC_ERR_PLIB, "*a_sz %" PetscInt_FMT " != kk %" PetscInt_FMT, *a_sz, kk);
  *a_local_is = is_loc; /* out */

  PetscFunctionReturn(PETSC_SUCCESS);
}
380 
381 /* edge for priority queue */
typedef struct edge_tag {
  PetscReal weight;                 /* matrix entry (real part) used as the matching weight */
  PetscInt  lid0, gid1, ghost1_idx; /* local id of vertex 0; global id of vertex 1;
                                       index of vertex 1 into the ghost arrays, or -1 if vertex 1 is local */
} Edge;
386 
387 #define MY_MEPS (PETSC_MACHINE_EPSILON * 100)
388 static int gamg_hem_compare(const void *a, const void *b)
389 {
390   PetscReal va = ((Edge *)a)->weight, vb = ((Edge *)b)->weight;
391   return (va <= vb - MY_MEPS) ? 1 : (va > vb + MY_MEPS) ? -1 : 0; /* 0 for equal */
392 }
393 /* static int gamg_hem_compare3(const void *a, const void *b, void *ctx) */
394 /* { */
395 /*   return gamg_hem_compare(a, b); */
396 /* } */
397 
398 /*
399   MatCoarsenApply_HEM_private - parallel heavy edge matching
400 
401   Input Parameter:
402    . a_Gmat - global matrix of the graph
403    . n_iter - number of matching iterations
404    . threshold - threshold for filtering graphs
405 
406   Output Parameter:
407    . a_locals_llist - array of list of local nodes rooted at local node
408 */
409 static PetscErrorCode MatCoarsenApply_HEM_private(Mat a_Gmat, const PetscInt n_iter, const PetscReal threshold, PetscCoarsenData **a_locals_llist)
410 {
411 #define REQ_BF_SIZE 100
412   PetscBool         isMPI;
413   MPI_Comm          comm;
414   PetscInt          ix, *ii, *aj, Iend, my0, ncomm_procs, bc_agg = -1, *rbuff = NULL, rbuff_sz = 0;
415   PetscMPIInt       rank, size, comm_procs[REQ_BF_SIZE], *lid_max_pe;
416   const PetscInt    nloc = a_Gmat->rmap->n, request_size = PetscCeilReal((PetscReal)sizeof(MPI_Request) / (PetscReal)sizeof(PetscInt));
417   PetscInt         *lid_cprowID;
418   PetscBool        *lid_matched;
419   Mat_SeqAIJ       *matA, *matB = NULL;
420   Mat_MPIAIJ       *mpimat     = NULL;
421   PetscScalar       one        = 1.;
422   PetscCoarsenData *agg_llists = NULL, *ghost_deleted_list = NULL, *bc_list = NULL;
423   Mat               cMat, tMat, P;
424   MatScalar        *ap;
425   IS                info_is;
426 
427   PetscFunctionBegin;
428   PetscCall(PetscObjectGetComm((PetscObject)a_Gmat, &comm));
429   PetscCallMPI(MPI_Comm_rank(comm, &rank));
430   PetscCallMPI(MPI_Comm_size(comm, &size));
431   PetscCall(MatGetOwnershipRange(a_Gmat, &my0, &Iend));
432   PetscCall(ISCreate(comm, &info_is));
433   PetscCall(PetscInfo(info_is, "%" PetscInt_FMT " iterations of HEM.\n", n_iter));
434 
435   PetscCall(PetscMalloc1(nloc, &lid_matched));
436   PetscCall(PetscMalloc1(nloc, &lid_cprowID));
437   PetscCall(PetscMalloc1(nloc, &lid_max_pe));
438 
439   PetscCall(PetscCDCreate(nloc, &agg_llists));
440   PetscCall(PetscCDSetChunkSize(agg_llists, nloc + 1));
441   *a_locals_llist = agg_llists;
442   /* add self to all lists */
443   for (int kk = 0; kk < nloc; kk++) PetscCall(PetscCDAppendID(agg_llists, kk, my0 + kk));
444   /* make a copy of the graph, this gets destroyed in iterates */
445   PetscCall(MatDuplicate(a_Gmat, MAT_COPY_VALUES, &cMat));
446   PetscCall(MatConvert(cMat, MATAIJ, MAT_INPLACE_MATRIX, &cMat));
447   isMPI = (PetscBool)(size > 1);
448   if (isMPI) {
449     /* list of deleted ghosts, should compress this */
450     PetscCall(PetscCDCreate(size, &ghost_deleted_list));
451     PetscCall(PetscCDSetChunkSize(ghost_deleted_list, 100));
452   }
453   for (int iter = 0; iter < n_iter; iter++) {
454     const PetscScalar *lghost_max_ew, *lid_max_ew;
455     PetscBool         *lghost_matched;
456     PetscMPIInt       *lghost_pe, *lghost_max_pe;
457     Vec                locMaxEdge, ghostMaxEdge, ghostMaxPE, locMaxPE;
458     PetscInt          *lghost_gid, nEdges, nEdges0, num_ghosts = 0;
459     Edge              *Edges;
460     const int          n_sub_its = 2500; // in case of a bug, stop at some point
461     /* get submatrices of cMat */
462     for (int kk = 0; kk < nloc; kk++) lid_cprowID[kk] = -1;
463     if (isMPI) {
464       mpimat = (Mat_MPIAIJ *)cMat->data;
465       matA   = (Mat_SeqAIJ *)mpimat->A->data;
466       matB   = (Mat_SeqAIJ *)mpimat->B->data;
467       if (!matB->compressedrow.use) {
468         /* force construction of compressed row data structure since code below requires it */
469         PetscCall(MatCheckCompressedRow(mpimat->B, matB->nonzerorowcnt, &matB->compressedrow, matB->i, mpimat->B->rmap->n, -1.0));
470       }
471       /* set index into compressed row 'lid_cprowID' */
472       for (ix = 0; ix < matB->compressedrow.nrows; ix++) {
473         PetscInt *ridx = matB->compressedrow.rindex, lid = ridx[ix];
474         if (ridx[ix] >= 0) lid_cprowID[lid] = ix;
475         else PetscCall(PetscPrintf(PETSC_COMM_SELF, "Missing slot in cprow? %d/%d ", (int)ix, (int)matB->compressedrow.nrows));
476       }
477     } else {
478       matA = (Mat_SeqAIJ *)cMat->data;
479     }
480     /* set matched flags: true for empty list */
481     for (int kk = 0; kk < nloc; kk++) {
482       PetscCall(PetscCDCountAt(agg_llists, kk, &ix));
483       if (ix > 0) lid_matched[kk] = PETSC_FALSE;
484       else lid_matched[kk] = PETSC_TRUE; // call deleted gids as matched
485     }
486     /* max edge and pe vecs */
487     PetscCall(MatCreateVecs(cMat, &locMaxEdge, NULL));
488     PetscCall(MatCreateVecs(cMat, &locMaxPE, NULL));
489     /* get 'lghost_pe' & 'lghost_gid' & init. 'lghost_matched' using 'mpimat->lvec' */
490     if (isMPI) {
491       Vec                vec;
492       PetscScalar        vval;
493       const PetscScalar *buf;
494       PetscCall(MatCreateVecs(cMat, &vec, NULL));
495       PetscCall(VecGetLocalSize(mpimat->lvec, &num_ghosts));
496       /* lghost_matched */
497       for (PetscInt kk = 0, gid = my0; kk < nloc; kk++, gid++) {
498         PetscScalar vval = lid_matched[kk] ? 1.0 : 0.0;
499         PetscCall(VecSetValues(vec, 1, &gid, &vval, INSERT_VALUES));
500       }
501       PetscCall(VecAssemblyBegin(vec));
502       PetscCall(VecAssemblyEnd(vec));
503       PetscCall(VecScatterBegin(mpimat->Mvctx, vec, mpimat->lvec, INSERT_VALUES, SCATTER_FORWARD));
504       PetscCall(VecScatterEnd(mpimat->Mvctx, vec, mpimat->lvec, INSERT_VALUES, SCATTER_FORWARD));
505       PetscCall(VecGetArrayRead(mpimat->lvec, &buf)); /* get proc ID in 'buf' */
506       PetscCall(PetscMalloc1(num_ghosts, &lghost_matched));
507       for (int kk = 0; kk < num_ghosts; kk++) {
508         lghost_matched[kk] = (PetscBool)(PetscRealPart(buf[kk]) != 0); // the proc of the ghost for now
509       }
510       PetscCall(VecRestoreArrayRead(mpimat->lvec, &buf));
511       /* lghost_pe */
512       vval = (PetscScalar)(rank);
513       for (PetscInt kk = 0, gid = my0; kk < nloc; kk++, gid++) PetscCall(VecSetValues(vec, 1, &gid, &vval, INSERT_VALUES)); /* set with GID */
514       PetscCall(VecAssemblyBegin(vec));
515       PetscCall(VecAssemblyEnd(vec));
516       PetscCall(VecScatterBegin(mpimat->Mvctx, vec, mpimat->lvec, INSERT_VALUES, SCATTER_FORWARD));
517       PetscCall(VecScatterEnd(mpimat->Mvctx, vec, mpimat->lvec, INSERT_VALUES, SCATTER_FORWARD));
518       PetscCall(VecGetArrayRead(mpimat->lvec, &buf)); /* get proc ID in 'buf' */
519       PetscCall(PetscMalloc1(num_ghosts, &lghost_pe));
520       for (int kk = 0; kk < num_ghosts; kk++) lghost_pe[kk] = (PetscMPIInt)PetscRealPart(buf[kk]); // the proc of the ghost for now
521       PetscCall(VecRestoreArrayRead(mpimat->lvec, &buf));
522       /* lghost_gid */
523       for (PetscInt kk = 0, gid = my0; kk < nloc; kk++, gid++) {
524         vval = (PetscScalar)(gid);
525         PetscCall(VecSetValues(vec, 1, &gid, &vval, INSERT_VALUES)); /* set with GID */
526       }
527       PetscCall(VecAssemblyBegin(vec));
528       PetscCall(VecAssemblyEnd(vec));
529       PetscCall(VecScatterBegin(mpimat->Mvctx, vec, mpimat->lvec, INSERT_VALUES, SCATTER_FORWARD));
530       PetscCall(VecScatterEnd(mpimat->Mvctx, vec, mpimat->lvec, INSERT_VALUES, SCATTER_FORWARD));
531       PetscCall(VecDestroy(&vec));
532       PetscCall(VecGetArrayRead(mpimat->lvec, &buf)); /* get proc ID in 'lghost_gid' */
533       PetscCall(PetscMalloc1(num_ghosts, &lghost_gid));
534       for (int kk = 0; kk < num_ghosts; kk++) lghost_gid[kk] = (PetscInt)PetscRealPart(buf[kk]);
535       PetscCall(VecRestoreArrayRead(mpimat->lvec, &buf));
536     }
537     // get 'comm_procs' (could hoist)
538     for (int kk = 0; kk < REQ_BF_SIZE; kk++) comm_procs[kk] = -1;
539     for (ix = 0, ncomm_procs = 0; ix < num_ghosts; ix++) {
540       PetscMPIInt proc = lghost_pe[ix], idx = -1;
541       for (int k = 0; k < ncomm_procs && idx == -1; k++)
542         if (comm_procs[k] == proc) idx = k;
543       if (idx == -1) { comm_procs[ncomm_procs++] = proc; }
544       PetscCheck(ncomm_procs != REQ_BF_SIZE, PETSC_COMM_SELF, PETSC_ERR_SUP, "Receive request array too small: %d", (int)ncomm_procs);
545     }
546     /* count edges, compute initial 'locMaxEdge', 'locMaxPE' */
547     nEdges0 = 0;
548     for (PetscInt kk = 0, gid = my0; kk < nloc; kk++, gid++) {
549       PetscReal   max_e = 0., tt;
550       PetscScalar vval;
551       PetscInt    lid = kk, max_pe = rank, pe, n;
552       ii = matA->i;
553       n  = ii[lid + 1] - ii[lid];
554       aj = matA->j + ii[lid];
555       ap = matA->a + ii[lid];
556       for (int jj = 0; jj < n; jj++) {
557         PetscInt lidj = aj[jj];
558         if ((tt = PetscRealPart(ap[jj])) > threshold && lidj != lid) {
559           if (tt > max_e) max_e = tt;
560           if (lidj > lid) nEdges0++;
561         }
562       }
563       if ((ix = lid_cprowID[lid]) != -1) { /* if I have any ghost neighbors */
564         ii = matB->compressedrow.i;
565         n  = ii[ix + 1] - ii[ix];
566         ap = matB->a + ii[ix];
567         aj = matB->j + ii[ix];
568         for (int jj = 0; jj < n; jj++) {
569           if ((tt = PetscRealPart(ap[jj])) > threshold) {
570             if (tt > max_e) max_e = tt;
571             nEdges0++;
572             if ((pe = lghost_pe[aj[jj]]) > max_pe) max_pe = pe;
573           }
574         }
575       }
576       vval = max_e;
577       PetscCall(VecSetValues(locMaxEdge, 1, &gid, &vval, INSERT_VALUES));
578       vval = (PetscScalar)max_pe;
579       PetscCall(VecSetValues(locMaxPE, 1, &gid, &vval, INSERT_VALUES));
580       if (iter == 0 && max_e <= MY_MEPS) { // add BCs to fake aggregate
581         lid_matched[lid] = PETSC_TRUE;
582         if (bc_agg == -1) {
583           bc_agg = lid;
584           PetscCall(PetscCDCreate(1, &bc_list));
585         }
586         PetscCall(PetscCDRemoveAllAt(agg_llists, lid));
587         PetscCall(PetscCDAppendID(bc_list, 0, my0 + lid));
588       }
589     }
590     PetscCall(VecAssemblyBegin(locMaxEdge));
591     PetscCall(VecAssemblyEnd(locMaxEdge));
592     PetscCall(VecAssemblyBegin(locMaxPE));
593     PetscCall(VecAssemblyEnd(locMaxPE));
594     /* make 'ghostMaxEdge_max_ew', 'lghost_max_pe' */
595     if (mpimat) {
596       const PetscScalar *buf;
597       PetscCall(VecDuplicate(mpimat->lvec, &ghostMaxEdge));
598       PetscCall(VecScatterBegin(mpimat->Mvctx, locMaxEdge, ghostMaxEdge, INSERT_VALUES, SCATTER_FORWARD));
599       PetscCall(VecScatterEnd(mpimat->Mvctx, locMaxEdge, ghostMaxEdge, INSERT_VALUES, SCATTER_FORWARD));
600 
601       PetscCall(VecDuplicate(mpimat->lvec, &ghostMaxPE));
602       PetscCall(VecScatterBegin(mpimat->Mvctx, locMaxPE, ghostMaxPE, INSERT_VALUES, SCATTER_FORWARD));
603       PetscCall(VecScatterEnd(mpimat->Mvctx, locMaxPE, ghostMaxPE, INSERT_VALUES, SCATTER_FORWARD));
604       PetscCall(VecGetArrayRead(ghostMaxPE, &buf));
605       PetscCall(PetscMalloc1(num_ghosts, &lghost_max_pe));
606       for (int kk = 0; kk < num_ghosts; kk++) lghost_max_pe[kk] = (PetscMPIInt)PetscRealPart(buf[kk]); // the MAX proc of the ghost now
607       PetscCall(VecRestoreArrayRead(ghostMaxPE, &buf));
608     }
609     { // make lid_max_pe
610       const PetscScalar *buf;
611       PetscCall(VecGetArrayRead(locMaxPE, &buf));
612       for (int kk = 0; kk < nloc; kk++) lid_max_pe[kk] = (PetscMPIInt)PetscRealPart(buf[kk]); // the MAX proc of the ghost now
613       PetscCall(VecRestoreArrayRead(locMaxPE, &buf));
614     }
615     /* setup sorted list of edges, and make 'Edges' */
616     PetscCall(PetscMalloc1(nEdges0, &Edges));
617     nEdges = 0;
618     for (int kk = 0, n; kk < nloc; kk++) {
619       const PetscInt lid = kk;
620       PetscReal      tt;
621       ii = matA->i;
622       n  = ii[lid + 1] - ii[lid];
623       aj = matA->j + ii[lid];
624       ap = matA->a + ii[lid];
625       for (int jj = 0; jj < n; jj++) {
626         PetscInt lidj = aj[jj];
627         if ((tt = PetscRealPart(ap[jj])) > threshold && lidj != lid) {
628           if (lidj > lid) {
629             Edges[nEdges].lid0       = lid;
630             Edges[nEdges].gid1       = lidj + my0;
631             Edges[nEdges].ghost1_idx = -1;
632             Edges[nEdges].weight     = tt;
633             nEdges++;
634           }
635         }
636       }
637       if ((ix = lid_cprowID[lid]) != -1) { /* if I have any ghost neighbor */
638         ii = matB->compressedrow.i;
639         n  = ii[ix + 1] - ii[ix];
640         ap = matB->a + ii[ix];
641         aj = matB->j + ii[ix];
642         for (int jj = 0; jj < n; jj++) {
643           if ((tt = PetscRealPart(ap[jj])) > threshold) {
644             Edges[nEdges].lid0       = lid;
645             Edges[nEdges].gid1       = lghost_gid[aj[jj]];
646             Edges[nEdges].ghost1_idx = aj[jj];
647             Edges[nEdges].weight     = tt;
648             nEdges++;
649           }
650         }
651       }
652     }
653     PetscCheck(nEdges == nEdges0, PETSC_COMM_SELF, PETSC_ERR_SUP, "nEdges != nEdges0: %d %d", (int)nEdges0, (int)nEdges);
654     qsort(Edges, nEdges, sizeof(Edge), gamg_hem_compare);
655 
656     PetscCall(PetscInfo(info_is, "[%d] start HEM iteration %d with number edges=%d\n", rank, iter, (int)nEdges));
657 
658     /* projection matrix */
659     PetscCall(MatCreate(comm, &P));
660     PetscCall(MatSetType(P, MATAIJ));
661     PetscCall(MatSetSizes(P, nloc, nloc, PETSC_DETERMINE, PETSC_DETERMINE));
662     PetscCall(MatMPIAIJSetPreallocation(P, 1, NULL, 1, NULL));
663     PetscCall(MatSeqAIJSetPreallocation(P, 1, NULL));
664     PetscCall(MatSetUp(P));
665     /* process - communicate - process */
666     for (int sub_it = 0; /* sub_it < n_sub_its */; /* sub_it++ */) {
667       PetscInt    nactive_edges = 0, n_act_n[3], gn_act_n[3];
668       PetscMPIInt tag1, tag2;
669       PetscCall(VecGetArrayRead(locMaxEdge, &lid_max_ew));
670       if (isMPI) {
671         PetscCall(VecGetArrayRead(ghostMaxEdge, &lghost_max_ew));
672         PetscCall(PetscCommGetNewTag(comm, &tag1));
673         PetscCall(PetscCommGetNewTag(comm, &tag2));
674       }
675       for (int kk = 0; kk < nEdges; kk++) {
676         /* HEM */
677         const Edge    *e    = &Edges[kk];
678         const PetscInt lid0 = e->lid0, gid1 = e->gid1, ghost1_idx = e->ghost1_idx, gid0 = lid0 + my0, lid1 = gid1 - my0;
679         PetscBool      isOK = PETSC_TRUE, print = PETSC_FALSE;
680         if (print) PetscCall(PetscSynchronizedPrintf(PETSC_COMM_WORLD, "\t[%d] edge (%d %d), %d %d %d\n", rank, (int)gid0, (int)gid1, lid_matched[lid0], (ghost1_idx != -1 && lghost_matched[ghost1_idx]), (ghost1_idx == -1 && lid_matched[lid1])));
681         /* skip if either vertex is matched already */
682         if (lid_matched[lid0] || (ghost1_idx != -1 && lghost_matched[ghost1_idx]) || (ghost1_idx == -1 && lid_matched[lid1])) continue;
683 
684         nactive_edges++;
685         PetscCheck(PetscRealPart(lid_max_ew[lid0]) >= e->weight - MY_MEPS, PETSC_COMM_SELF, PETSC_ERR_SUP, "edge weight %e > max %e", (double)e->weight, (double)PetscRealPart(lid_max_ew[lid0]));
686         if (print) PetscCall(PetscSynchronizedPrintf(PETSC_COMM_WORLD, "\t[%d] active edge (%d %d), diff0 = %10.4e\n", rank, (int)gid0, (int)gid1, (double)(PetscRealPart(lid_max_ew[lid0]) - (double)e->weight)));
687         // smaller edge, lid_max_ew get updated - e0
688         if (PetscRealPart(lid_max_ew[lid0]) > e->weight + MY_MEPS) {
689           if (print)
690             PetscCall(PetscSynchronizedPrintf(PETSC_COMM_WORLD, "\t\t[%d] 1) e0 SKIPPING small edge %20.14e edge (%d %d), diff = %10.4e to proc %d. max = %20.14e, w = %20.14e\n", rank, (double)e->weight, (int)gid0, (int)gid1, (double)(PetscRealPart(lid_max_ew[lid0]) - e->weight), ghost1_idx != -1 ? (int)lghost_pe[ghost1_idx] : rank, (double)PetscRealPart(lid_max_ew[lid0]),
691                                               (double)e->weight));
692           continue; // we are basically filter edges here
693         }
694         // e1 - local
695         if (ghost1_idx == -1) {
696           if (PetscRealPart(lid_max_ew[lid1]) > e->weight + MY_MEPS) {
697             if (print)
698               PetscCall(PetscSynchronizedPrintf(PETSC_COMM_WORLD, "\t\t%c[%d] 2) e1 SKIPPING small local edge %20.14e edge (%d %d), diff = %10.4e\n", ghost1_idx != -1 ? '\t' : ' ', rank, (double)e->weight, (int)gid0, (int)gid1, (double)(PetscRealPart(lid_max_ew[lid1]) - e->weight)));
699             continue; // we are basically filter edges here
700           }
701         } else { // e1 - ghost
702           /* see if edge might get matched on other proc */
703           PetscReal g_max_e1 = PetscRealPart(lghost_max_ew[ghost1_idx]);
704           if (print)
705             PetscCall(PetscSynchronizedPrintf(PETSC_COMM_WORLD, "\t\t\t[%d] CHECK GHOST e1, edge (%d %d), E0 MAX EDGE WEIGHT = %10.4e, EDGE WEIGHT = %10.4e, diff1 = %10.4e, ghost proc %d with max pe %d on e0 and %d on e1\n", rank, (int)gid0, (int)gid1, (double)PetscRealPart(lid_max_ew[lid0]),
706                                               (double)e->weight, (double)(PetscRealPart(lghost_max_ew[ghost1_idx]) - e->weight), (int)lghost_pe[ghost1_idx], lid_max_pe[lid0], lghost_max_pe[ghost1_idx]));
707           if (g_max_e1 > e->weight + MY_MEPS) {
708             /* PetscCall(PetscSynchronizedPrintf(PETSC_COMM_WORLD,"\t\t\t\t[%d] 3) ghost e1 SKIPPING small edge (%d %d), diff = %10.4e from proc %d with max pe %d. max = %20.14e, w = %20.14e\n", rank, (int)gid0, (int)gid1, g_max_e1 - e->weight, (int)lghost_pe[ghost1_idx], lghost_max_pe[ghost1_idx], g_max_e1, e->weight )); */
709             continue;
710           } else if (g_max_e1 >= e->weight - MY_MEPS && lghost_max_pe[ghost1_idx] > rank) {
711             /* check for max_ea == to this edge and larger processor that will deal with this */
712             if (print)
713               PetscCall(PetscSynchronizedPrintf(PETSC_COMM_WORLD, "\t\t\t[%d] ghost e1 SKIPPING EQUAL (%d %d), diff = %10.4e from larger proc %d with max pe %d. max = %20.14e, w = %20.14e\n", rank, (int)gid0, (int)gid1,
714                                                 (double)(PetscRealPart(lid_max_ew[lid0]) - (double)e->weight), (int)lghost_pe[ghost1_idx], (int)lghost_max_pe[ghost1_idx], (double)g_max_e1, (double)e->weight));
715             isOK = PETSC_FALSE; // this guy could delete me
716             continue;
717           } else {
718             /* PetscCall(PetscSynchronizedPrintf(PETSC_COMM_WORLD,"\t[%d] Edge (%d %d) passes gid0 tests, diff = %10.4e from proc %d with max pe %d. max = %20.14e, w = %20.14e\n", rank, (int)gid0, (int)gid1, g_max_e1 - e->weight, (int)lghost_pe[ghost1_idx], lghost_max_pe[ghost1_idx], g_max_e1, e->weight )); */
719           }
720         }
721         /* check ghost for v0 */
722         if (isOK) {
723           PetscReal max_e, ew;
724           if ((ix = lid_cprowID[lid0]) != -1) { /* if I have any ghost neighbors */
725             int n;
726             ii = matB->compressedrow.i;
727             n  = ii[ix + 1] - ii[ix];
728             ap = matB->a + ii[ix];
729             aj = matB->j + ii[ix];
730             for (int jj = 0; jj < n && isOK; jj++) {
731               PetscInt lidj = aj[jj];
732               if (lghost_matched[lidj]) continue;
733               ew = PetscRealPart(ap[jj]);
734               if (ew <= threshold) continue;
735               max_e = PetscRealPart(lghost_max_ew[lidj]);
736               /* check for max_e == to this edge and larger processor that will deal with this */
737               if (ew >= PetscRealPart(lid_max_ew[lid0]) - MY_MEPS && lghost_pe[lidj] > rank) isOK = PETSC_FALSE;
738               PetscCheck(ew <= max_e + MY_MEPS, PETSC_COMM_SELF, PETSC_ERR_SUP, "edge weight %e > max %e", (double)PetscRealPart(ew), (double)PetscRealPart(max_e));
739               if (print)
740                 PetscCall(PetscSynchronizedPrintf(PETSC_COMM_WORLD, "\t\t\t\t[%d] e0: looked at ghost adj (%d %d), diff = %10.4e, ghost on proc %d (max %d). isOK = %d, %d %d %d; ew = %e, lid0 max ew = %e, diff = %e, eps = %e\n", rank, (int)gid0, (int)lghost_gid[lidj], (double)(max_e - ew), lghost_pe[lidj], lghost_max_pe[lidj], isOK, (double)(ew) >= (double)(max_e - MY_MEPS), ew >= PetscRealPart(lid_max_ew[lid0]) - MY_MEPS, lghost_pe[lidj] > rank, (double)ew, (double)PetscRealPart(lid_max_ew[lid0]), (double)(ew - PetscRealPart(lid_max_ew[lid0])), (double)MY_MEPS));
741             }
742             if (!isOK && print) PetscCall(PetscSynchronizedPrintf(PETSC_COMM_WORLD, "\t\t[%d] skip edge (%d %d) from ghost inspection\n", rank, (int)gid0, (int)gid1));
743           }
744           /* check local v1 */
745           if (ghost1_idx == -1) {
746             if ((ix = lid_cprowID[lid1]) != -1) { /* if I have any ghost neighbors */
747               int n;
748               ii = matB->compressedrow.i;
749               n  = ii[ix + 1] - ii[ix];
750               ap = matB->a + ii[ix];
751               aj = matB->j + ii[ix];
752               for (int jj = 0; jj < n && isOK; jj++) {
753                 PetscInt lidj = aj[jj];
754                 if (lghost_matched[lidj]) continue;
755                 ew = PetscRealPart(ap[jj]);
756                 if (ew <= threshold) continue;
757                 max_e = PetscRealPart(lghost_max_ew[lidj]);
758                 /* check for max_e == to this edge and larger processor that will deal with this */
759                 if (ew >= PetscRealPart(lid_max_ew[lid1]) - MY_MEPS && lghost_pe[lidj] > rank) isOK = PETSC_FALSE;
760                 PetscCheck(ew <= max_e + MY_MEPS, PETSC_COMM_SELF, PETSC_ERR_SUP, "edge weight %e > max %e", (double)PetscRealPart(ew), (double)PetscRealPart(max_e));
761                 if (print)
762                   PetscCall(PetscSynchronizedPrintf(PETSC_COMM_WORLD, "\t\t\t\t\t[%d] e1: looked at ghost adj (%d %d), diff = %10.4e, ghost on proc %d (max %d)\n", rank, (int)gid0, (int)lghost_gid[lidj], (double)(max_e - ew), lghost_pe[lidj], lghost_max_pe[lidj]));
763               }
764             }
765             if (!isOK && print) PetscCall(PetscSynchronizedPrintf(PETSC_COMM_WORLD, "\t\t[%d] skip edge (%d %d) from ghost inspection\n", rank, (int)gid0, (int)gid1));
766           }
767         }
768         PetscReal e1_max_w = (ghost1_idx == -1 ? PetscRealPart(lid_max_ew[lid0]) : PetscRealPart(lghost_max_ew[ghost1_idx]));
769         if (print)
770           PetscCall(PetscSynchronizedPrintf(PETSC_COMM_WORLD, "\t[%d] MATCHING (%d %d) e1 max weight = %e, e1 wight diff %e, %s. isOK = %d\n", rank, (int)gid0, (int)gid1, (double)e1_max_w, (double)(e1_max_w - e->weight), ghost1_idx == -1 ? "local" : "ghost", isOK));
771         /* do it */
772         if (isOK) {
773           if (ghost1_idx == -1) {
774             PetscCheck(!lid_matched[lid1], PETSC_COMM_SELF, PETSC_ERR_SUP, "local %d is matched", (int)gid1);
775             lid_matched[lid1] = PETSC_TRUE;                       /* keep track of what we've done this round */
776             PetscCall(PetscCDMoveAppend(agg_llists, lid0, lid1)); // takes lid1's list and appends to lid0's
777           } else {
778             /* add gid1 to list of ghost deleted by me -- I need their children */
779             PetscMPIInt proc = lghost_pe[ghost1_idx];
780             PetscCheck(!lghost_matched[ghost1_idx], PETSC_COMM_SELF, PETSC_ERR_SUP, "ghost %d is matched", (int)lghost_gid[ghost1_idx]);
781             lghost_matched[ghost1_idx] = PETSC_TRUE;
782             PetscCall(PetscCDAppendID(ghost_deleted_list, proc, ghost1_idx)); /* cache to send messages */
783             PetscCall(PetscCDAppendID(ghost_deleted_list, proc, lid0));
784           }
785           lid_matched[lid0] = PETSC_TRUE; /* keep track of what we've done this round */
786           /* set projection */
787           PetscCall(MatSetValues(P, 1, &gid0, 1, &gid0, &one, INSERT_VALUES));
788           PetscCall(MatSetValues(P, 1, &gid1, 1, &gid0, &one, INSERT_VALUES));
789           //PetscCall(PetscPrintf(PETSC_COMM_WORLD,"\t %d.%d) match active EDGE %d : (%d %d)\n",iter,sub_it, (int)nactive_edges, (int)gid0, (int)gid1));
790         } /* matched */
791       }   /* edge loop */
792       PetscCall(PetscSynchronizedFlush(PETSC_COMM_WORLD, PETSC_STDOUT));
793       if (isMPI) PetscCall(VecRestoreArrayRead(ghostMaxEdge, &lghost_max_ew));
794       PetscCall(VecRestoreArrayRead(locMaxEdge, &lid_max_ew));
795       // count active for test, latter, update deleted ghosts
796       n_act_n[0] = nactive_edges;
797       if (ghost_deleted_list) PetscCall(PetscCDCount(ghost_deleted_list, &n_act_n[2]));
798       else n_act_n[2] = 0;
799       PetscCall(PetscCDCount(agg_llists, &n_act_n[1]));
800       PetscCall(MPIU_Allreduce(n_act_n, gn_act_n, 3, MPIU_INT, MPI_SUM, comm));
801       PetscCall(PetscInfo(info_is, "[%d] %d.%d) nactive edges=%" PetscInt_FMT ", ncomm_procs=%d, nEdges=%d, %" PetscInt_FMT " deleted ghosts, N=%" PetscInt_FMT "\n", rank, iter, sub_it, gn_act_n[0], (int)ncomm_procs, (int)nEdges, gn_act_n[2], gn_act_n[1]));
802       /* deal with deleted ghost */
803       if (isMPI) {
804         PetscCDIntNd *pos;
805         PetscInt     *sbuffs1[REQ_BF_SIZE], ndel;
806         PetscInt     *sbuffs2[REQ_BF_SIZE];
807         MPI_Status    status;
808 
809         /* send deleted ghosts */
810         for (int proc_idx = 0; proc_idx < ncomm_procs; proc_idx++) {
811           const PetscMPIInt proc = comm_procs[proc_idx];
812           PetscInt         *sbuff, *pt, scount;
813           MPI_Request      *request;
814           /* count ghosts */
815           PetscCall(PetscCDCountAt(ghost_deleted_list, proc, &ndel));
816           ndel /= 2; // two entries for each proc
817           scount = 2 + 2 * ndel;
818           PetscCall(PetscMalloc1(scount + request_size, &sbuff));
819           /* save requests */
820           sbuffs1[proc_idx] = sbuff;
821           request           = (MPI_Request *)sbuff;
822           sbuff = pt = (PetscInt *)(sbuff + request_size);
823           /* write [ndel, proc, n*[gid1,gid0] */
824           *pt++ = ndel; // number of deleted to send
825           *pt++ = rank; // proc (not used)
826           PetscCall(PetscCDGetHeadPos(ghost_deleted_list, proc, &pos));
827           while (pos) {
828             PetscInt lid0, ghost_idx, gid1;
829             PetscCall(PetscCDIntNdGetID(pos, &ghost_idx));
830             gid1 = lghost_gid[ghost_idx];
831             PetscCall(PetscCDGetNextPos(ghost_deleted_list, proc, &pos));
832             PetscCall(PetscCDIntNdGetID(pos, &lid0));
833             PetscCall(PetscCDGetNextPos(ghost_deleted_list, proc, &pos));
834             *pt++ = gid1;
835             *pt++ = lid0 + my0; // gid0
836           }
837           PetscCheck(pt - sbuff == scount, PETSC_COMM_SELF, PETSC_ERR_SUP, "sbuff-pt != scount: %d", (int)(pt - sbuff));
838           /* MPI_Isend:  tag1 [ndel, proc, n*[gid1,gid0] ] */
839           PetscCallMPI(MPI_Isend(sbuff, scount, MPIU_INT, proc, tag1, comm, request));
840           PetscCall(PetscCDRemoveAllAt(ghost_deleted_list, proc)); // done with this list
841         }
842         /* receive deleted, send back partial aggregates, clear lists */
843         for (int proc_idx = 0; proc_idx < ncomm_procs; proc_idx++) {
844           PetscCallMPI(MPI_Probe(comm_procs[proc_idx] /* MPI_ANY_SOURCE */, tag1, comm, &status));
845           {
846             PetscInt         *pt, *pt2, *pt3, *sbuff, tmp;
847             MPI_Request      *request;
848             int               rcount, scount, ndel;
849             const PetscMPIInt proc = status.MPI_SOURCE;
850             PetscCallMPI(MPI_Get_count(&status, MPIU_INT, &rcount));
851             if (rcount > rbuff_sz) {
852               if (rbuff) PetscCall(PetscFree(rbuff));
853               PetscCall(PetscMalloc1(rcount, &rbuff));
854               rbuff_sz = rcount;
855             }
856             /* MPI_Recv: tag1 [ndel, proc, ndel*[gid1,gid0] ] */
857             PetscCallMPI(MPI_Recv(rbuff, rcount, MPIU_INT, proc, tag1, comm, &status));
858             /* read and count sends *[lid0, n, n*[gid] ] */
859             pt     = rbuff;
860             scount = 0;
861             ndel   = *pt++; // number of deleted to recv
862             tmp    = *pt++; // proc (not used)
863             while (ndel--) {
864               PetscInt gid1 = *pt++, lid1 = gid1 - my0;
865               int      gh_gid0 = *pt++; // gid on other proc (not used here to count)
866               PetscCheck(lid1 >= 0 && lid1 < nloc, PETSC_COMM_SELF, PETSC_ERR_SUP, "received ghost deleted %d", (int)gid1);
867               PetscCheck(!lid_matched[lid1], PETSC_COMM_SELF, PETSC_ERR_PLIB, "%d) received matched local gid %" PetscInt_FMT ",%d, with ghost (lid) %" PetscInt_FMT " from proc %d", sub_it, gid1, gh_gid0, tmp, proc);
868               lid_matched[lid1] = PETSC_TRUE;                    /* keep track of what we've done this round */
869               PetscCall(PetscCDCountAt(agg_llists, lid1, &tmp)); // n
870               /* PetscCheck(tmp == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "sending %d (!= 1) size aggregate. gid-0 %d, from %d (gid-1 %d)", (int)tmp, (int) gid, proc, gh_gid0); */
871               scount += tmp + 2; // lid0, n, n*[gid]
872             }
873             PetscCheck((pt - rbuff) == rcount, PETSC_COMM_SELF, PETSC_ERR_SUP, "receive buffer size != num read: %d; rcount: %d", (int)(pt - rbuff), rcount);
874             /* send tag2: *[gid0, n, n*[gid] ] */
875             PetscCall(PetscMalloc1(scount + request_size, &sbuff));
876             sbuffs2[proc_idx] = sbuff; /* cache request */
877             request           = (MPI_Request *)sbuff;
878             pt2 = sbuff = sbuff + request_size;
879             // read again: n, proc, n*[gid1,gid0]
880             pt   = rbuff;
881             ndel = *pt++;
882             tmp  = *pt++; // proc (not used)
883             while (ndel--) {
884               PetscInt gid1 = *pt++, lid1 = gid1 - my0, gh_gid0 = *pt++;
885               /* write [gid0, aggSz, aggSz[gid] ] */
886               *pt2++ = gh_gid0;
887               pt3    = pt2++; /* save pointer for later */
888               PetscCall(PetscCDGetHeadPos(agg_llists, lid1, &pos));
889               while (pos) {
890                 PetscInt gid;
891                 PetscCall(PetscCDIntNdGetID(pos, &gid));
892                 PetscCall(PetscCDGetNextPos(agg_llists, lid1, &pos));
893                 *pt2++ = gid;
894               }
895               *pt3 = (pt2 - pt3) - 1;
896               /* clear list */
897               PetscCall(PetscCDRemoveAllAt(agg_llists, lid1));
898             }
899             PetscCheck((pt2 - sbuff) == scount, PETSC_COMM_SELF, PETSC_ERR_SUP, "buffer size != num write: %d %d", (int)(pt2 - sbuff), (int)scount);
900             /* MPI_Isend: requested data tag2 *[lid0, n, n*[gid1] ] */
901             PetscCallMPI(MPI_Isend(sbuff, scount, MPIU_INT, proc, tag2, comm, request));
902           }
903         } // proc_idx
904         /* receive tag2 *[gid0, n, n*[gid] ] */
905         for (int proc_idx = 0; proc_idx < ncomm_procs; proc_idx++) {
906           PetscMPIInt proc;
907           PetscInt   *pt;
908           int         rcount;
909           PetscCallMPI(MPI_Probe(comm_procs[proc_idx] /* MPI_ANY_SOURCE */, tag2, comm, &status));
910           PetscCallMPI(MPI_Get_count(&status, MPIU_INT, &rcount));
911           if (rcount > rbuff_sz) {
912             if (rbuff) PetscCall(PetscFree(rbuff));
913             PetscCall(PetscMalloc1(rcount, &rbuff));
914             rbuff_sz = rcount;
915           }
916           proc = status.MPI_SOURCE;
917           /* MPI_Recv:  tag1 [n, proc, n*[gid1,lid0] ] */
918           PetscCallMPI(MPI_Recv(rbuff, rcount, MPIU_INT, proc, tag2, comm, &status));
919           pt = rbuff;
920           while (pt - rbuff < rcount) {
921             PetscInt gid0 = *pt++, n = *pt++;
922             while (n--) {
923               PetscInt gid1 = *pt++;
924               PetscCall(PetscCDAppendID(agg_llists, gid0 - my0, gid1));
925             }
926           }
927           PetscCheck((pt - rbuff) == rcount, PETSC_COMM_SELF, PETSC_ERR_SUP, "recv buffer size != num read: %d %d", (int)(pt - rbuff), (int)rcount);
928         }
929         /* wait for tag1 isends */
930         for (int proc_idx = 0; proc_idx < ncomm_procs; proc_idx++) {
931           MPI_Request *request;
932           request = (MPI_Request *)sbuffs1[proc_idx];
933           PetscCallMPI(MPI_Wait(request, &status));
934           PetscCall(PetscFree(sbuffs1[proc_idx]));
935         }
936         /* wait for tag2 isends */
937         for (int proc_idx = 0; proc_idx < ncomm_procs; proc_idx++) {
938           MPI_Request *request = (MPI_Request *)sbuffs2[proc_idx];
939           PetscCallMPI(MPI_Wait(request, &status));
940           PetscCall(PetscFree(sbuffs2[proc_idx]));
941         }
942       } /* MPI */
943       /* set 'lghost_matched' - use locMaxEdge, ghostMaxEdge (recomputed next) */
944       if (isMPI) {
945         const PetscScalar *sbuff;
946         for (PetscInt kk = 0, gid = my0; kk < nloc; kk++, gid++) {
947           PetscScalar vval = lid_matched[kk] ? 1.0 : 0.0;
948           PetscCall(VecSetValues(locMaxEdge, 1, &gid, &vval, INSERT_VALUES)); /* set with GID */
949         }
950         PetscCall(VecAssemblyBegin(locMaxEdge));
951         PetscCall(VecAssemblyEnd(locMaxEdge));
952         PetscCall(VecScatterBegin(mpimat->Mvctx, locMaxEdge, ghostMaxEdge, INSERT_VALUES, SCATTER_FORWARD));
953         PetscCall(VecScatterEnd(mpimat->Mvctx, locMaxEdge, ghostMaxEdge, INSERT_VALUES, SCATTER_FORWARD));
954         PetscCall(VecGetArrayRead(ghostMaxEdge, &sbuff));
955         for (int kk = 0; kk < num_ghosts; kk++) { lghost_matched[kk] = (PetscBool)(PetscRealPart(sbuff[kk]) != 0.0); }
956         PetscCall(VecRestoreArrayRead(ghostMaxEdge, &sbuff));
957       }
958       /* compute 'locMaxEdge' inside sub iteration b/c max weight can drop as neighbors are matched */
959       for (PetscInt kk = 0, gid = my0; kk < nloc; kk++, gid++) {
960         PetscReal      max_e = 0., tt;
961         PetscScalar    vval;
962         const PetscInt lid    = kk;
963         int            max_pe = rank, pe, n;
964         ii                    = matA->i;
965         n                     = ii[lid + 1] - ii[lid];
966         aj                    = matA->j + ii[lid];
967         ap                    = matA->a + ii[lid];
968         for (int jj = 0; jj < n; jj++) {
969           PetscInt lidj = aj[jj];
970           if (lid_matched[lidj]) continue; /* this is new - can change local max */
971           if (lidj != lid && PetscRealPart(ap[jj]) > max_e) max_e = PetscRealPart(ap[jj]);
972         }
973         if (lid_cprowID && (ix = lid_cprowID[lid]) != -1) { /* if I have any ghost neighbors */
974           ii = matB->compressedrow.i;
975           n  = ii[ix + 1] - ii[ix];
976           ap = matB->a + ii[ix];
977           aj = matB->j + ii[ix];
978           for (int jj = 0; jj < n; jj++) {
979             PetscInt lidj = aj[jj];
980             if (lghost_matched[lidj]) continue;
981             if ((tt = PetscRealPart(ap[jj])) > max_e) max_e = tt;
982           }
983         }
984         vval = (PetscScalar)max_e;
985         PetscCall(VecSetValues(locMaxEdge, 1, &gid, &vval, INSERT_VALUES)); /* set with GID */
986         // max PE with max edge
987         if (lid_cprowID && (ix = lid_cprowID[lid]) != -1) { /* if I have any ghost neighbors */
988           ii = matB->compressedrow.i;
989           n  = ii[ix + 1] - ii[ix];
990           ap = matB->a + ii[ix];
991           aj = matB->j + ii[ix];
992           for (int jj = 0; jj < n; jj++) {
993             PetscInt lidj = aj[jj];
994             if (lghost_matched[lidj]) continue;
995             if ((pe = lghost_pe[aj[jj]]) > max_pe && PetscRealPart(ap[jj]) >= max_e - MY_MEPS) { max_pe = pe; }
996           }
997         }
998         vval = (PetscScalar)max_pe;
999         PetscCall(VecSetValues(locMaxPE, 1, &gid, &vval, INSERT_VALUES));
1000       }
1001       PetscCall(VecAssemblyBegin(locMaxEdge));
1002       PetscCall(VecAssemblyEnd(locMaxEdge));
1003       PetscCall(VecAssemblyBegin(locMaxPE));
1004       PetscCall(VecAssemblyEnd(locMaxPE));
1005       /* compute 'lghost_max_ew' and 'lghost_max_pe' to get ready for next iteration*/
1006       if (isMPI) {
1007         const PetscScalar *buf;
1008         PetscCall(VecScatterBegin(mpimat->Mvctx, locMaxEdge, ghostMaxEdge, INSERT_VALUES, SCATTER_FORWARD));
1009         PetscCall(VecScatterEnd(mpimat->Mvctx, locMaxEdge, ghostMaxEdge, INSERT_VALUES, SCATTER_FORWARD));
1010         PetscCall(VecScatterBegin(mpimat->Mvctx, locMaxPE, ghostMaxPE, INSERT_VALUES, SCATTER_FORWARD));
1011         PetscCall(VecScatterEnd(mpimat->Mvctx, locMaxPE, ghostMaxPE, INSERT_VALUES, SCATTER_FORWARD));
1012         PetscCall(VecGetArrayRead(ghostMaxPE, &buf));
1013         for (int kk = 0; kk < num_ghosts; kk++) {
1014           lghost_max_pe[kk] = (PetscMPIInt)PetscRealPart(buf[kk]); // the MAX proc of the ghost now
1015         }
1016         PetscCall(VecRestoreArrayRead(ghostMaxPE, &buf));
1017       }
1018       // if no active edges, stop
1019       if (gn_act_n[0] < 1) break;
1020       // inc and check (self stopping iteration
1021       sub_it++;
1022       PetscCheck(sub_it < n_sub_its, PETSC_COMM_SELF, PETSC_ERR_SUP, "failed to finish HEM step %d/%d", sub_it + 1, n_sub_its);
1023     } /* sub_it loop */
1024     /* clean up iteration */
1025     PetscCall(PetscFree(Edges));
1026     if (mpimat) { // can be hoisted
1027       PetscCall(VecRestoreArrayRead(ghostMaxEdge, &lghost_max_ew));
1028       PetscCall(VecDestroy(&ghostMaxEdge));
1029       PetscCall(VecDestroy(&ghostMaxPE));
1030       PetscCall(PetscFree(lghost_pe));
1031       PetscCall(PetscFree(lghost_gid));
1032       PetscCall(PetscFree(lghost_matched));
1033       PetscCall(PetscFree(lghost_max_pe));
1034     }
1035     PetscCall(VecDestroy(&locMaxEdge));
1036     PetscCall(VecDestroy(&locMaxPE));
1037     /* create next graph */
1038     {
1039       Vec diag;
1040       /* add identity for unmatched vertices so they stay alive */
1041       for (PetscInt kk = 0, gid1, gid = my0; kk < nloc; kk++, gid++) {
1042         if (!lid_matched[kk]) {
1043           const PetscInt lid = kk;
1044           PetscCDIntNd  *pos;
1045           PetscCall(PetscCDGetHeadPos(agg_llists, lid, &pos));
1046           PetscCheck(pos, PETSC_COMM_SELF, PETSC_ERR_PLIB, "empty list in singleton: %d", (int)gid);
1047           PetscCall(PetscCDIntNdGetID(pos, &gid1));
1048           PetscCheck(gid1 == gid, PETSC_COMM_SELF, PETSC_ERR_PLIB, "first in list (%d) in singleton not %d", (int)gid1, (int)gid);
1049           PetscCall(MatSetValues(P, 1, &gid, 1, &gid, &one, INSERT_VALUES));
1050         }
1051       }
1052       PetscCall(MatAssemblyBegin(P, MAT_FINAL_ASSEMBLY));
1053       PetscCall(MatAssemblyEnd(P, MAT_FINAL_ASSEMBLY));
1054 
1055       /* project to make new graph with collapsed edges */
1056       PetscCall(MatPtAP(cMat, P, MAT_INITIAL_MATRIX, 1.0, &tMat));
1057       PetscCall(MatDestroy(&P));
1058       PetscCall(MatDestroy(&cMat));
1059       cMat = tMat;
1060       PetscCall(MatCreateVecs(cMat, &diag, NULL));
1061       PetscCall(MatGetDiagonal(cMat, diag));
1062       PetscCall(VecReciprocal(diag));
1063       PetscCall(VecSqrtAbs(diag));
1064       PetscCall(MatDiagonalScale(cMat, diag, diag));
1065       PetscCall(VecDestroy(&diag));
1066     }
1067   } /* coarsen iterator */
1068 
1069   /* make fake matrix with Mat->B only for smoothed agg QR. Need this if we make an aux graph (ie, PtAP) with k > 1 */
1070   if (size > 1) {
1071     Mat           mat;
1072     PetscCDIntNd *pos;
1073     PetscInt      NN, MM, jj = 0, mxsz = 0;
1074 
1075     for (int kk = 0; kk < nloc; kk++) {
1076       PetscCall(PetscCDCountAt(agg_llists, kk, &jj));
1077       if (jj > mxsz) mxsz = jj;
1078     }
1079     PetscCall(MatGetSize(a_Gmat, &MM, &NN));
1080     if (mxsz > MM - nloc) mxsz = MM - nloc;
1081     /* matrix of ghost adj for square graph */
1082     PetscCall(MatCreateAIJ(comm, nloc, nloc, PETSC_DETERMINE, PETSC_DETERMINE, 0, NULL, mxsz, NULL, &mat));
1083     for (PetscInt lid = 0, gid = my0; lid < nloc; lid++, gid++) {
1084       PetscCall(PetscCDGetHeadPos(agg_llists, lid, &pos));
1085       while (pos) {
1086         PetscInt gid1;
1087         PetscCall(PetscCDIntNdGetID(pos, &gid1));
1088         PetscCall(PetscCDGetNextPos(agg_llists, lid, &pos));
1089         if (gid1 < my0 || gid1 >= my0 + nloc) PetscCall(MatSetValues(mat, 1, &gid, 1, &gid1, &one, ADD_VALUES));
1090       }
1091     }
1092     PetscCall(MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY));
1093     PetscCall(MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY));
1094     PetscCall(PetscCDSetMat(agg_llists, mat));
1095     PetscCall(PetscCDDestroy(ghost_deleted_list));
1096     if (rbuff_sz) PetscCall(PetscFree(rbuff)); // always true
1097   }
1098   // move BCs into some node
1099   if (bc_list) {
1100     PetscCDIntNd *pos;
1101     PetscCall(PetscCDGetHeadPos(bc_list, 0, &pos));
1102     while (pos) {
1103       PetscInt gid1;
1104       PetscCall(PetscCDIntNdGetID(pos, &gid1));
1105       PetscCall(PetscCDGetNextPos(bc_list, 0, &pos));
1106       PetscCall(PetscCDAppendID(agg_llists, bc_agg, gid1));
1107     }
1108     PetscCall(PetscCDRemoveAllAt(bc_list, 0));
1109     PetscCall(PetscCDDestroy(bc_list));
1110   }
1111   {
1112     // check sizes -- all vertices must get in graph
1113     PetscInt sz, globalsz, MM;
1114     PetscCall(MatGetSize(a_Gmat, &MM, NULL));
1115     PetscCall(PetscCDCount(agg_llists, &sz));
1116     PetscCall(MPIU_Allreduce(&sz, &globalsz, 1, MPIU_INT, MPI_SUM, comm));
1117     PetscCheck(MM == globalsz, comm, PETSC_ERR_SUP, "lost %d equations ?", (int)(MM - globalsz));
1118   }
1119   // cleanup
1120   PetscCall(MatDestroy(&cMat));
1121   PetscCall(PetscFree(lid_cprowID));
1122   PetscCall(PetscFree(lid_max_pe));
1123   PetscCall(PetscFree(lid_matched));
1124   PetscCall(ISDestroy(&info_is));
1125 
1126   PetscFunctionReturn(PETSC_SUCCESS);
1127 }
1128 
1129 /*
1130    HEM coarsen, simple greedy.
1131 */
1132 static PetscErrorCode MatCoarsenApply_HEM(MatCoarsen coarse)
1133 {
1134   Mat mat = coarse->graph;
1135 
1136   PetscFunctionBegin;
1137   PetscCall(MatCoarsenApply_HEM_private(mat, coarse->max_it, coarse->threshold, &coarse->agg_lists));
1138 
1139   PetscFunctionReturn(PETSC_SUCCESS);
1140 }
1141 
1142 static PetscErrorCode MatCoarsenView_HEM(MatCoarsen coarse, PetscViewer viewer)
1143 {
1144   PetscMPIInt rank;
1145   PetscBool   iascii;
1146 
1147   PetscFunctionBegin;
1148   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)coarse), &rank));
1149   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1150   if (iascii) {
1151     PetscCDIntNd *pos, *pos2;
1152     PetscCall(PetscViewerASCIIPrintf(viewer, "%d matching steps with threshold = %g\n", (int)coarse->max_it, (double)coarse->threshold));
1153     PetscCall(PetscViewerASCIIPushSynchronized(viewer));
1154     for (PetscInt kk = 0; kk < coarse->agg_lists->size; kk++) {
1155       PetscCall(PetscCDGetHeadPos(coarse->agg_lists, kk, &pos));
1156       if ((pos2 = pos)) PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "selected local %d: ", (int)kk));
1157       while (pos) {
1158         PetscInt gid1;
1159         PetscCall(PetscCDIntNdGetID(pos, &gid1));
1160         PetscCall(PetscCDGetNextPos(coarse->agg_lists, kk, &pos));
1161         PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, " %d ", (int)gid1));
1162       }
1163       if (pos2) PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "\n"));
1164     }
1165     PetscCall(PetscViewerFlush(viewer));
1166     PetscCall(PetscViewerASCIIPopSynchronized(viewer));
1167   }
1168   PetscFunctionReturn(PETSC_SUCCESS);
1169 }
1170 
1171 /*
1172   MatCoarsenCreate_HEM - A coarsener that uses HEM a simple greedy coarsener
1173 */
1174 PETSC_EXTERN PetscErrorCode MatCoarsenCreate_HEM(MatCoarsen coarse)
1175 {
1176   PetscFunctionBegin;
1177   coarse->ops->apply = MatCoarsenApply_HEM;
1178   coarse->ops->view  = MatCoarsenView_HEM;
1179   coarse->max_it     = 4;
1180   PetscFunctionReturn(PETSC_SUCCESS);
1181 }
1182