xref: /petsc/src/mat/graphops/coarsen/impls/hem/hem.c (revision d9acb416d05abeed0a33bde3a81aeb2ea0364f6a)
1 #include <petsc/private/matimpl.h> /*I "petscmat.h" I*/
2 #include <../src/mat/impls/aij/seq/aij.h>
3 #include <../src/mat/impls/aij/mpi/mpiaij.h>
4 #include <petscdm.h>
5 
6 /* linked list methods
7  *
8  *  PetscCDCreate
9  */
10 PetscErrorCode PetscCDCreate(PetscInt a_size, PetscCoarsenData **a_out)
11 {
12   PetscCoarsenData *ail;
13 
14   PetscFunctionBegin;
15   /* allocate pool, partially */
16   PetscCall(PetscNew(&ail));
17   *a_out               = ail;
18   ail->pool_list.next  = NULL;
19   ail->pool_list.array = NULL;
20   ail->chk_sz          = 0;
21   /* allocate array */
22   ail->size = a_size;
23   PetscCall(PetscCalloc1(a_size, &ail->array));
24   ail->extra_nodes = NULL;
25   ail->mat         = NULL;
26   PetscFunctionReturn(PETSC_SUCCESS);
27 }
28 
29 /* PetscCDDestroy
30  */
31 PetscErrorCode PetscCDDestroy(PetscCoarsenData *ail)
32 {
33   PetscCDArrNd *n = &ail->pool_list;
34 
35   PetscFunctionBegin;
36   n = n->next;
37   while (n) {
38     PetscCDArrNd *lstn = n;
39     n                  = n->next;
40     PetscCall(PetscFree(lstn));
41   }
42   if (ail->pool_list.array) PetscCall(PetscFree(ail->pool_list.array));
43   PetscCall(PetscFree(ail->array));
44   if (ail->mat) PetscCall(MatDestroy(&ail->mat));
45   /* delete this (+agg+pool array) */
46   PetscCall(PetscFree(ail));
47   PetscFunctionReturn(PETSC_SUCCESS);
48 }
49 
50 /* PetscCDSetChunkSize
51  */
PetscErrorCode PetscCDSetChunkSize(PetscCoarsenData *ail, PetscInt a_sz)
{
  PetscFunctionBegin;
  /* Set the number of list nodes allocated per pool chunk (see PetscCDGetNewNode). */
  ail->chk_sz = a_sz;
  PetscFunctionReturn(PETSC_SUCCESS);
}
58 
59 /*  PetscCDGetNewNode
60  */
/* Hand out one list node with gid 'a_id' and next == NULL.  Nodes come either
   from the recycle cache ('extra_nodes', filled by PetscCDRemoveAllAt) or from
   the current pool chunk; a fresh chunk is allocated when the pool runs dry. */
static PetscErrorCode PetscCDGetNewNode(PetscCoarsenData *ail, PetscCDIntNd **a_out, PetscInt a_id)
{
  PetscFunctionBegin;
  *a_out = NULL; /* squelch -Wmaybe-uninitialized */
  if (ail->extra_nodes) {
    /* reuse a recycled node: pop it off the cache */
    PetscCDIntNd *node = ail->extra_nodes;
    ail->extra_nodes   = node->next;
    node->gid          = a_id;
    node->next         = NULL;
    *a_out             = node;
  } else {
    if (!ail->pool_list.array) {
      /* first chunk: a bare node array, freed separately in PetscCDDestroy */
      if (!ail->chk_sz) ail->chk_sz = 10; /* use a chunk size of ail->size? */
      PetscCall(PetscMalloc1(ail->chk_sz, &ail->pool_list.array));
      ail->new_node       = ail->pool_list.array;
      ail->new_left       = ail->chk_sz;
      ail->new_node->next = NULL;
    } else if (!ail->new_left) {
      /* current chunk exhausted: allocate header + node array in one shot and
         push the chunk onto the pool chain for later bulk free */
      PetscCDArrNd *node;
      PetscCall(PetscMalloc(ail->chk_sz * sizeof(PetscCDIntNd) + sizeof(PetscCDArrNd), &node));
      node->array         = (PetscCDIntNd *)(node + 1); /* node storage directly follows the header */
      node->next          = ail->pool_list.next;
      ail->pool_list.next = node;
      ail->new_left       = ail->chk_sz;
      ail->new_node       = node->array;
    }
    /* hand out the next free node of the current chunk */
    ail->new_node->gid  = a_id;
    ail->new_node->next = NULL;
    *a_out              = ail->new_node++;
    ail->new_left--;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
94 
95 /* PetscCDIntNdSetID
96  */
PetscErrorCode PetscCDIntNdSetID(PetscCDIntNd *a_this, PetscInt a_id)
{
  PetscFunctionBegin;
  /* Set the global id stored in a list node. */
  a_this->gid = a_id;
  PetscFunctionReturn(PETSC_SUCCESS);
}
103 
104 /* PetscCDIntNdGetID
105  */
PetscErrorCode PetscCDIntNdGetID(const PetscCDIntNd *a_this, PetscInt *a_gid)
{
  PetscFunctionBegin;
  /* Return the global id stored in a list node. */
  *a_gid = a_this->gid;
  PetscFunctionReturn(PETSC_SUCCESS);
}
112 
113 /* PetscCDGetHeadPos
114  */
PetscErrorCode PetscCDGetHeadPos(const PetscCoarsenData *ail, PetscInt a_idx, PetscCDIntNd **pos)
{
  PetscFunctionBegin;
  /* Return the head node (iterator start) of list 'a_idx'; NULL if that list is empty. */
  PetscCheck(a_idx < ail->size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "a_idx >= ail->size: a_idx=%" PetscInt_FMT ".", a_idx);
  *pos = ail->array[a_idx];
  PetscFunctionReturn(PETSC_SUCCESS);
}
122 
123 /* PetscCDGetNextPos
124  */
PetscErrorCode PetscCDGetNextPos(const PetscCoarsenData *ail, PetscInt l_idx, PetscCDIntNd **pos)
{
  PetscFunctionBegin;
  /* Advance iterator *pos to the following node (may become NULL at list end).
     Note: 'ail' and 'l_idx' are unused here; they exist for interface symmetry
     with PetscCDGetHeadPos. */
  PetscCheck((*pos), PETSC_COMM_SELF, PETSC_ERR_PLIB, "NULL input position.");
  *pos = (*pos)->next;
  PetscFunctionReturn(PETSC_SUCCESS);
}
132 
133 /* PetscCDAppendID
134  */
135 PetscErrorCode PetscCDAppendID(PetscCoarsenData *ail, PetscInt a_idx, PetscInt a_id)
136 {
137   PetscCDIntNd *n, *n2;
138 
139   PetscFunctionBegin;
140   PetscCall(PetscCDGetNewNode(ail, &n, a_id));
141   PetscCheck(a_idx < ail->size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Index %" PetscInt_FMT " out of range.", a_idx);
142   if (!(n2 = ail->array[a_idx])) ail->array[a_idx] = n;
143   else {
144     do {
145       if (!n2->next) {
146         n2->next = n;
147         PetscCheck(!n->next, PETSC_COMM_SELF, PETSC_ERR_PLIB, "n should not have a next");
148         break;
149       }
150       n2 = n2->next;
151     } while (n2);
152     PetscCheck(n2, PETSC_COMM_SELF, PETSC_ERR_PLIB, "n2 should be non-null");
153   }
154   PetscFunctionReturn(PETSC_SUCCESS);
155 }
156 
157 /* PetscCDAppendNode
158  */
159 PetscErrorCode PetscCDAppendNode(PetscCoarsenData *ail, PetscInt a_idx, PetscCDIntNd *a_n)
160 {
161   PetscCDIntNd *n2;
162 
163   PetscFunctionBegin;
164   PetscCheck(a_idx < ail->size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Index %" PetscInt_FMT " out of range.", a_idx);
165   if (!(n2 = ail->array[a_idx])) ail->array[a_idx] = a_n;
166   else {
167     do {
168       if (!n2->next) {
169         n2->next  = a_n;
170         a_n->next = NULL;
171         break;
172       }
173       n2 = n2->next;
174     } while (n2);
175     PetscCheck(n2, PETSC_COMM_SELF, PETSC_ERR_PLIB, "n2 should be non-null");
176   }
177   PetscFunctionReturn(PETSC_SUCCESS);
178 }
179 
180 /* PetscCDRemoveNextNode: a_last->next, this exposes single linked list structure to API (not used)
181  */
PetscErrorCode PetscCDRemoveNextNode(PetscCoarsenData *ail, PetscInt a_idx, PetscCDIntNd *a_last)
{
  PetscCDIntNd *del;

  PetscFunctionBegin;
  /* Unlink the node following 'a_last' from list 'a_idx'.  The removed node is
     NOT freed or recycled here, and its 'next' pointer is deliberately left
     intact (see comment below). */
  PetscCheck(a_idx < ail->size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Index %" PetscInt_FMT " out of range.", a_idx);
  PetscCheck(a_last->next, PETSC_COMM_SELF, PETSC_ERR_PLIB, "a_last should have a next");
  del          = a_last->next;
  a_last->next = del->next;
  /* del->next = NULL; -- this is still used in an iterator so keep it intact -- need to fix this with a doubly linked list */
  /* could reuse n2 but PetscCDAppendNode sometimes uses it */
  PetscFunctionReturn(PETSC_SUCCESS);
}
195 
196 /* PetscCDPrint
197  */
198 PetscErrorCode PetscCDPrint(const PetscCoarsenData *ail, PetscInt my0, MPI_Comm comm)
199 {
200   PetscCDIntNd *n, *n2;
201   PetscInt      ii;
202 
203   PetscFunctionBegin;
204   for (ii = 0; ii < ail->size; ii++) {
205     n2 = n = ail->array[ii];
206     if (n) PetscCall(PetscSynchronizedPrintf(comm, "list %" PetscInt_FMT ":", ii + my0));
207     while (n) {
208       PetscCall(PetscSynchronizedPrintf(comm, " %" PetscInt_FMT, n->gid));
209       n = n->next;
210     }
211     if (n2) PetscCall(PetscSynchronizedPrintf(comm, "\n"));
212   }
213   PetscCall(PetscSynchronizedFlush(comm, PETSC_STDOUT));
214   PetscFunctionReturn(PETSC_SUCCESS);
215 }
216 
217 /* PetscCDMoveAppend - take list in a_srcidx and appends to destidx
218  */
219 PetscErrorCode PetscCDMoveAppend(PetscCoarsenData *ail, PetscInt a_destidx, PetscInt a_srcidx)
220 {
221   PetscCDIntNd *n;
222 
223   PetscFunctionBegin;
224   PetscCheck(a_srcidx < ail->size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Index %" PetscInt_FMT " out of range.", a_srcidx);
225   PetscCheck(a_destidx < ail->size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Index %" PetscInt_FMT " out of range.", a_destidx);
226   PetscCheck(a_destidx != a_srcidx, PETSC_COMM_SELF, PETSC_ERR_PLIB, "a_destidx==a_srcidx %" PetscInt_FMT ".", a_destidx);
227   n = ail->array[a_destidx];
228   if (!n) ail->array[a_destidx] = ail->array[a_srcidx];
229   else {
230     do {
231       if (!n->next) {
232         n->next = ail->array[a_srcidx]; // append
233         break;
234       }
235       n = n->next;
236     } while (1);
237   }
238   ail->array[a_srcidx] = NULL; // empty
239   PetscFunctionReturn(PETSC_SUCCESS);
240 }
241 
242 /* PetscCDRemoveAllAt - empty one list and move data to cache
243  */
244 PetscErrorCode PetscCDRemoveAllAt(PetscCoarsenData *ail, PetscInt a_idx)
245 {
246   PetscCDIntNd *rem, *n1;
247 
248   PetscFunctionBegin;
249   PetscCheck(a_idx < ail->size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Index %" PetscInt_FMT " out of range.", a_idx);
250   rem               = ail->array[a_idx];
251   ail->array[a_idx] = NULL;
252   if (!(n1 = ail->extra_nodes)) ail->extra_nodes = rem;
253   else {
254     while (n1->next) n1 = n1->next;
255     n1->next = rem;
256   }
257   PetscFunctionReturn(PETSC_SUCCESS);
258 }
259 
260 /* PetscCDCountAt
261  */
262 PetscErrorCode PetscCDCountAt(const PetscCoarsenData *ail, PetscInt a_idx, PetscInt *a_sz)
263 {
264   PetscCDIntNd *n1;
265   PetscInt      sz = 0;
266 
267   PetscFunctionBegin;
268   PetscCheck(a_idx < ail->size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Index %" PetscInt_FMT " out of range.", a_idx);
269   n1 = ail->array[a_idx];
270   while (n1) {
271     n1 = n1->next;
272     sz++;
273   }
274   *a_sz = sz;
275   PetscFunctionReturn(PETSC_SUCCESS);
276 }
277 
/* PetscCDCount
 */
280 PetscErrorCode PetscCDCount(const PetscCoarsenData *ail, PetscInt *a_sz)
281 {
282   PetscInt sz = 0;
283 
284   PetscFunctionBegin;
285   for (int ii = 0; ii < ail->size; ii++) {
286     PetscCDIntNd *n1 = ail->array[ii];
287     while (n1) {
288       n1 = n1->next;
289       sz++;
290     }
291   }
292   *a_sz = sz;
293   PetscFunctionReturn(PETSC_SUCCESS);
294 }
295 
296 /* PetscCDIsEmptyAt - Is the list empty? (not used)
297  */
PetscErrorCode PetscCDIsEmptyAt(const PetscCoarsenData *ail, PetscInt a_idx, PetscBool *a_e)
{
  PetscFunctionBegin;
  /* Set *a_e to PETSC_TRUE iff the list at 'a_idx' has no nodes. */
  PetscCheck(a_idx < ail->size, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Index %" PetscInt_FMT " out of range.", a_idx);
  *a_e = (PetscBool)(ail->array[a_idx] == NULL);
  PetscFunctionReturn(PETSC_SUCCESS);
}
305 
306 /* PetscCDGetNonemptyIS - used for C-F methods
307  */
308 PetscErrorCode PetscCDGetNonemptyIS(PetscCoarsenData *ail, IS *a_mis)
309 {
310   PetscCDIntNd *n;
311   PetscInt      ii, kk;
312   PetscInt     *permute;
313 
314   PetscFunctionBegin;
315   for (ii = kk = 0; ii < ail->size; ii++) {
316     n = ail->array[ii];
317     if (n) kk++;
318   }
319   PetscCall(PetscMalloc1(kk, &permute));
320   for (ii = kk = 0; ii < ail->size; ii++) {
321     n = ail->array[ii];
322     if (n) permute[kk++] = ii;
323   }
324   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, kk, permute, PETSC_OWN_POINTER, a_mis));
325   PetscFunctionReturn(PETSC_SUCCESS);
326 }
327 
328 /* PetscCDGetMat
329  */
PetscErrorCode PetscCDGetMat(PetscCoarsenData *ail, Mat *a_mat)
{
  PetscFunctionBegin;
  /* Return the cached matrix (may be NULL); no new reference is taken. */
  *a_mat = ail->mat;
  PetscFunctionReturn(PETSC_SUCCESS);
}
336 
337 /* PetscCDSetMat
338  */
339 PetscErrorCode PetscCDSetMat(PetscCoarsenData *ail, Mat a_mat)
340 {
341   PetscFunctionBegin;
342   if (ail->mat) {
343     PetscCall(MatDestroy(&ail->mat)); //should not happen
344   }
345   ail->mat = a_mat;
346   PetscFunctionReturn(PETSC_SUCCESS);
347 }
348 
349 /* PetscCDClearMat
350  */
PetscErrorCode PetscCDClearMat(PetscCoarsenData *ail)
{
  PetscFunctionBegin;
  /* Drop the cached matrix pointer WITHOUT destroying it — presumably the
     caller has taken ownership (e.g. via PetscCDGetMat); verify against callers. */
  ail->mat = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
357 
358 /* PetscCDGetASMBlocks - get IS of aggregates for ASM smoothers
359  */
360 PetscErrorCode PetscCDGetASMBlocks(const PetscCoarsenData *ail, const PetscInt a_bs, PetscInt *a_sz, IS **a_local_is)
361 {
362   PetscCDIntNd *n;
363   PetscInt      lsz, ii, kk, *idxs, jj, gid;
364   IS           *is_loc = NULL;
365 
366   PetscFunctionBegin;
367   for (ii = kk = 0; ii < ail->size; ii++) {
368     if (ail->array[ii]) kk++;
369   }
370   *a_sz = kk;
371   PetscCall(PetscMalloc1(kk, &is_loc));
372   for (ii = kk = 0; ii < ail->size; ii++) {
373     for (lsz = 0, n = ail->array[ii]; n; lsz++, n = n->next) /* void */
374       ;
375     if (lsz) {
376       PetscCall(PetscMalloc1(a_bs * lsz, &idxs));
377       for (lsz = 0, n = ail->array[ii]; n; n = n->next) {
378         PetscCall(PetscCDIntNdGetID(n, &gid));
379         for (jj = 0; jj < a_bs; lsz++, jj++) idxs[lsz] = a_bs * gid + jj;
380       }
381       PetscCall(ISCreateGeneral(PETSC_COMM_SELF, lsz, idxs, PETSC_OWN_POINTER, &is_loc[kk++]));
382     }
383   }
384   PetscCheck(*a_sz == kk, PETSC_COMM_SELF, PETSC_ERR_PLIB, "*a_sz %" PetscInt_FMT " != kk %" PetscInt_FMT, *a_sz, kk);
385   *a_local_is = is_loc; /* out */
386 
387   PetscFunctionReturn(PETSC_SUCCESS);
388 }
389 
390 /* edge for priority queue */
/* edge for priority queue */
typedef struct edge_tag {
  PetscReal weight;                 /* edge weight (real part of the matrix entry) used for matching */
  PetscInt  lid0, gid1, ghost1_idx; /* local id of vertex 0; global id of vertex 1;
                                       index of vertex 1 into the ghost arrays, or -1 when vertex 1 is local */
} Edge;
395 
396 #define MY_MEPS (PETSC_MACHINE_EPSILON * 100)
397 static int gamg_hem_compare(const void *a, const void *b)
398 {
399   PetscReal va = ((Edge *)a)->weight, vb = ((Edge *)b)->weight;
400   return (va <= vb - MY_MEPS) ? 1 : (va > vb + MY_MEPS) ? -1 : 0; /* 0 for equal */
401 }
402 
403 /*
404   MatCoarsenApply_HEM_private - parallel heavy edge matching
405 
406   Input Parameter:
407    . a_Gmat - global matrix of the graph
408    . n_iter - number of matching iterations
409    . threshold - threshold for filtering graphs
410 
411   Output Parameter:
412    . a_locals_llist - array of list of local nodes rooted at local node
413 */
414 static PetscErrorCode MatCoarsenApply_HEM_private(Mat a_Gmat, const PetscInt n_iter, const PetscReal threshold, PetscCoarsenData **a_locals_llist)
415 {
416 #define REQ_BF_SIZE 100
417   PetscBool         isMPI;
418   MPI_Comm          comm;
419   PetscInt          ix, *ii, *aj, Iend, my0, ncomm_procs, bc_agg = -1, *rbuff = NULL, rbuff_sz = 0;
420   PetscMPIInt       rank, size, comm_procs[REQ_BF_SIZE], *lid_max_pe;
421   const PetscInt    nloc = a_Gmat->rmap->n, request_size = PetscCeilReal((PetscReal)sizeof(MPI_Request) / (PetscReal)sizeof(PetscInt));
422   PetscInt         *lid_cprowID;
423   PetscBool        *lid_matched;
424   Mat_SeqAIJ       *matA, *matB = NULL;
425   Mat_MPIAIJ       *mpimat     = NULL;
426   PetscScalar       one        = 1.;
427   PetscCoarsenData *agg_llists = NULL, *ghost_deleted_list = NULL, *bc_list = NULL;
428   Mat               cMat, tMat, P;
429   MatScalar        *ap;
430   IS                info_is;
431 
432   PetscFunctionBegin;
433   PetscCall(PetscObjectGetComm((PetscObject)a_Gmat, &comm));
434   PetscCallMPI(MPI_Comm_rank(comm, &rank));
435   PetscCallMPI(MPI_Comm_size(comm, &size));
436   PetscCall(MatGetOwnershipRange(a_Gmat, &my0, &Iend));
437   PetscCall(ISCreate(comm, &info_is));
438   PetscCall(PetscInfo(info_is, "Start %" PetscInt_FMT " iterations of HEM.\n", n_iter));
439 
440   PetscCall(PetscMalloc1(nloc, &lid_matched));
441   PetscCall(PetscMalloc1(nloc, &lid_cprowID));
442   PetscCall(PetscMalloc1(nloc, &lid_max_pe));
443 
444   PetscCall(PetscCDCreate(nloc, &agg_llists));
445   PetscCall(PetscCDSetChunkSize(agg_llists, nloc + 1));
446   *a_locals_llist = agg_llists;
447   /* add self to all lists */
448   for (int kk = 0; kk < nloc; kk++) PetscCall(PetscCDAppendID(agg_llists, kk, my0 + kk));
449   /* make a copy of the graph, this gets destroyed in iterates */
450   PetscCall(MatDuplicate(a_Gmat, MAT_COPY_VALUES, &cMat));
451   PetscCall(MatConvert(cMat, MATAIJ, MAT_INPLACE_MATRIX, &cMat));
452   isMPI = (PetscBool)(size > 1);
453   if (isMPI) {
454     /* list of deleted ghosts, should compress this */
455     PetscCall(PetscCDCreate(size, &ghost_deleted_list));
456     PetscCall(PetscCDSetChunkSize(ghost_deleted_list, 100));
457   }
458   for (int iter = 0; iter < n_iter; iter++) {
459     const PetscScalar *lghost_max_ew, *lid_max_ew;
460     PetscBool         *lghost_matched;
461     PetscMPIInt       *lghost_pe, *lghost_max_pe;
462     Vec                locMaxEdge, ghostMaxEdge, ghostMaxPE, locMaxPE;
463     PetscInt          *lghost_gid, nEdges, nEdges0, num_ghosts = 0;
464     Edge              *Edges;
465     const int          n_sub_its = 1000; // in case of a bug, stop at some point
466     /* get submatrices of cMat */
467     for (int kk = 0; kk < nloc; kk++) lid_cprowID[kk] = -1;
468     if (isMPI) {
469       mpimat = (Mat_MPIAIJ *)cMat->data;
470       matA   = (Mat_SeqAIJ *)mpimat->A->data;
471       matB   = (Mat_SeqAIJ *)mpimat->B->data;
472       if (!matB->compressedrow.use) {
473         /* force construction of compressed row data structure since code below requires it */
474         PetscCall(MatCheckCompressedRow(mpimat->B, matB->nonzerorowcnt, &matB->compressedrow, matB->i, mpimat->B->rmap->n, -1.0));
475       }
476       /* set index into compressed row 'lid_cprowID' */
477       for (ix = 0; ix < matB->compressedrow.nrows; ix++) {
478         PetscInt *ridx = matB->compressedrow.rindex, lid = ridx[ix];
479         if (ridx[ix] >= 0) lid_cprowID[lid] = ix;
480       }
481     } else {
482       matA = (Mat_SeqAIJ *)cMat->data;
483     }
484     /* set matched flags: true for empty list */
485     for (int kk = 0; kk < nloc; kk++) {
486       PetscCall(PetscCDCountAt(agg_llists, kk, &ix));
487       if (ix > 0) lid_matched[kk] = PETSC_FALSE;
488       else lid_matched[kk] = PETSC_TRUE; // call deleted gids as matched
489     }
490     /* max edge and pe vecs */
491     PetscCall(MatCreateVecs(cMat, &locMaxEdge, NULL));
492     PetscCall(MatCreateVecs(cMat, &locMaxPE, NULL));
493     /* get 'lghost_pe' & 'lghost_gid' & init. 'lghost_matched' using 'mpimat->lvec' */
494     if (isMPI) {
495       Vec                vec;
496       PetscScalar        vval;
497       const PetscScalar *buf;
498       PetscCall(MatCreateVecs(cMat, &vec, NULL));
499       PetscCall(VecGetLocalSize(mpimat->lvec, &num_ghosts));
500       /* lghost_matched */
501       for (PetscInt kk = 0, gid = my0; kk < nloc; kk++, gid++) {
502         PetscScalar vval = lid_matched[kk] ? 1.0 : 0.0;
503         PetscCall(VecSetValues(vec, 1, &gid, &vval, INSERT_VALUES));
504       }
505       PetscCall(VecAssemblyBegin(vec));
506       PetscCall(VecAssemblyEnd(vec));
507       PetscCall(VecScatterBegin(mpimat->Mvctx, vec, mpimat->lvec, INSERT_VALUES, SCATTER_FORWARD));
508       PetscCall(VecScatterEnd(mpimat->Mvctx, vec, mpimat->lvec, INSERT_VALUES, SCATTER_FORWARD));
509       PetscCall(VecGetArrayRead(mpimat->lvec, &buf)); /* get proc ID in 'buf' */
510       PetscCall(PetscMalloc1(num_ghosts, &lghost_matched));
511       for (int kk = 0; kk < num_ghosts; kk++) {
512         lghost_matched[kk] = (PetscBool)(PetscRealPart(buf[kk]) != 0); // the proc of the ghost for now
513       }
514       PetscCall(VecRestoreArrayRead(mpimat->lvec, &buf));
515       /* lghost_pe */
516       vval = (PetscScalar)(rank);
517       for (PetscInt kk = 0, gid = my0; kk < nloc; kk++, gid++) PetscCall(VecSetValues(vec, 1, &gid, &vval, INSERT_VALUES)); /* set with GID */
518       PetscCall(VecAssemblyBegin(vec));
519       PetscCall(VecAssemblyEnd(vec));
520       PetscCall(VecScatterBegin(mpimat->Mvctx, vec, mpimat->lvec, INSERT_VALUES, SCATTER_FORWARD));
521       PetscCall(VecScatterEnd(mpimat->Mvctx, vec, mpimat->lvec, INSERT_VALUES, SCATTER_FORWARD));
522       PetscCall(VecGetArrayRead(mpimat->lvec, &buf)); /* get proc ID in 'buf' */
523       PetscCall(PetscMalloc1(num_ghosts, &lghost_pe));
524       for (int kk = 0; kk < num_ghosts; kk++) lghost_pe[kk] = (PetscMPIInt)PetscRealPart(buf[kk]); // the proc of the ghost for now
525       PetscCall(VecRestoreArrayRead(mpimat->lvec, &buf));
526       /* lghost_gid */
527       for (PetscInt kk = 0, gid = my0; kk < nloc; kk++, gid++) {
528         vval = (PetscScalar)(gid);
529         PetscCall(VecSetValues(vec, 1, &gid, &vval, INSERT_VALUES)); /* set with GID */
530       }
531       PetscCall(VecAssemblyBegin(vec));
532       PetscCall(VecAssemblyEnd(vec));
533       PetscCall(VecScatterBegin(mpimat->Mvctx, vec, mpimat->lvec, INSERT_VALUES, SCATTER_FORWARD));
534       PetscCall(VecScatterEnd(mpimat->Mvctx, vec, mpimat->lvec, INSERT_VALUES, SCATTER_FORWARD));
535       PetscCall(VecDestroy(&vec));
536       PetscCall(VecGetArrayRead(mpimat->lvec, &buf)); /* get proc ID in 'lghost_gid' */
537       PetscCall(PetscMalloc1(num_ghosts, &lghost_gid));
538       for (int kk = 0; kk < num_ghosts; kk++) lghost_gid[kk] = (PetscInt)PetscRealPart(buf[kk]);
539       PetscCall(VecRestoreArrayRead(mpimat->lvec, &buf));
540     }
541     // get 'comm_procs' (could hoist)
542     for (int kk = 0; kk < REQ_BF_SIZE; kk++) comm_procs[kk] = -1;
543     for (ix = 0, ncomm_procs = 0; ix < num_ghosts; ix++) {
544       PetscMPIInt proc = lghost_pe[ix], idx = -1;
545       for (int k = 0; k < ncomm_procs && idx == -1; k++)
546         if (comm_procs[k] == proc) idx = k;
547       if (idx == -1) { comm_procs[ncomm_procs++] = proc; }
548       PetscCheck(ncomm_procs != REQ_BF_SIZE, PETSC_COMM_SELF, PETSC_ERR_SUP, "Receive request array too small: %d", (int)ncomm_procs);
549     }
550     /* count edges, compute initial 'locMaxEdge', 'locMaxPE' */
551     nEdges0 = 0;
552     for (PetscInt kk = 0, gid = my0; kk < nloc; kk++, gid++) {
553       PetscReal   max_e = 0., tt;
554       PetscScalar vval;
555       PetscInt    lid = kk, max_pe = rank, pe, n;
556       ii = matA->i;
557       n  = ii[lid + 1] - ii[lid];
558       aj = PetscSafePointerPlusOffset(matA->j, ii[lid]);
559       ap = PetscSafePointerPlusOffset(matA->a, ii[lid]);
560       for (int jj = 0; jj < n; jj++) {
561         PetscInt lidj = aj[jj];
562         if ((tt = PetscRealPart(ap[jj])) > threshold && lidj != lid) {
563           if (tt > max_e) max_e = tt;
564           if (lidj > lid) nEdges0++;
565         }
566       }
567       if ((ix = lid_cprowID[lid]) != -1) { /* if I have any ghost neighbors */
568         ii = matB->compressedrow.i;
569         n  = ii[ix + 1] - ii[ix];
570         ap = matB->a + ii[ix];
571         aj = matB->j + ii[ix];
572         for (int jj = 0; jj < n; jj++) {
573           if ((tt = PetscRealPart(ap[jj])) > threshold) {
574             if (tt > max_e) max_e = tt;
575             nEdges0++;
576             if ((pe = lghost_pe[aj[jj]]) > max_pe) max_pe = pe;
577           }
578         }
579       }
580       vval = max_e;
581       PetscCall(VecSetValues(locMaxEdge, 1, &gid, &vval, INSERT_VALUES));
582       vval = (PetscScalar)max_pe;
583       PetscCall(VecSetValues(locMaxPE, 1, &gid, &vval, INSERT_VALUES));
584       if (iter == 0 && max_e <= MY_MEPS) { // add BCs to fake aggregate
585         lid_matched[lid] = PETSC_TRUE;
586         if (bc_agg == -1) {
587           bc_agg = lid;
588           PetscCall(PetscCDCreate(1, &bc_list));
589         }
590         PetscCall(PetscCDRemoveAllAt(agg_llists, lid));
591         PetscCall(PetscCDAppendID(bc_list, 0, my0 + lid));
592       }
593     }
594     PetscCall(VecAssemblyBegin(locMaxEdge));
595     PetscCall(VecAssemblyEnd(locMaxEdge));
596     PetscCall(VecAssemblyBegin(locMaxPE));
597     PetscCall(VecAssemblyEnd(locMaxPE));
598     /* make 'ghostMaxEdge_max_ew', 'lghost_max_pe' */
599     if (mpimat) {
600       const PetscScalar *buf;
601       PetscCall(VecDuplicate(mpimat->lvec, &ghostMaxEdge));
602       PetscCall(VecScatterBegin(mpimat->Mvctx, locMaxEdge, ghostMaxEdge, INSERT_VALUES, SCATTER_FORWARD));
603       PetscCall(VecScatterEnd(mpimat->Mvctx, locMaxEdge, ghostMaxEdge, INSERT_VALUES, SCATTER_FORWARD));
604 
605       PetscCall(VecDuplicate(mpimat->lvec, &ghostMaxPE));
606       PetscCall(VecScatterBegin(mpimat->Mvctx, locMaxPE, ghostMaxPE, INSERT_VALUES, SCATTER_FORWARD));
607       PetscCall(VecScatterEnd(mpimat->Mvctx, locMaxPE, ghostMaxPE, INSERT_VALUES, SCATTER_FORWARD));
608       PetscCall(VecGetArrayRead(ghostMaxPE, &buf));
609       PetscCall(PetscMalloc1(num_ghosts, &lghost_max_pe));
610       for (int kk = 0; kk < num_ghosts; kk++) lghost_max_pe[kk] = (PetscMPIInt)PetscRealPart(buf[kk]); // the MAX proc of the ghost now
611       PetscCall(VecRestoreArrayRead(ghostMaxPE, &buf));
612     }
613     { // make lid_max_pe
614       const PetscScalar *buf;
615       PetscCall(VecGetArrayRead(locMaxPE, &buf));
616       for (int kk = 0; kk < nloc; kk++) lid_max_pe[kk] = (PetscMPIInt)PetscRealPart(buf[kk]); // the MAX proc of the ghost now
617       PetscCall(VecRestoreArrayRead(locMaxPE, &buf));
618     }
619     /* setup sorted list of edges, and make 'Edges' */
620     PetscCall(PetscMalloc1(nEdges0, &Edges));
621     nEdges = 0;
622     for (int kk = 0, n; kk < nloc; kk++) {
623       const PetscInt lid = kk;
624       PetscReal      tt;
625       ii = matA->i;
626       n  = ii[lid + 1] - ii[lid];
627       aj = PetscSafePointerPlusOffset(matA->j, ii[lid]);
628       ap = PetscSafePointerPlusOffset(matA->a, ii[lid]);
629       for (int jj = 0; jj < n; jj++) {
630         PetscInt lidj = aj[jj];
631         if ((tt = PetscRealPart(ap[jj])) > threshold && lidj != lid) {
632           if (lidj > lid) {
633             Edges[nEdges].lid0       = lid;
634             Edges[nEdges].gid1       = lidj + my0;
635             Edges[nEdges].ghost1_idx = -1;
636             Edges[nEdges].weight     = tt;
637             nEdges++;
638           }
639         }
640       }
641       if ((ix = lid_cprowID[lid]) != -1) { /* if I have any ghost neighbor */
642         ii = matB->compressedrow.i;
643         n  = ii[ix + 1] - ii[ix];
644         ap = matB->a + ii[ix];
645         aj = matB->j + ii[ix];
646         for (int jj = 0; jj < n; jj++) {
647           if ((tt = PetscRealPart(ap[jj])) > threshold) {
648             Edges[nEdges].lid0       = lid;
649             Edges[nEdges].gid1       = lghost_gid[aj[jj]];
650             Edges[nEdges].ghost1_idx = aj[jj];
651             Edges[nEdges].weight     = tt;
652             nEdges++;
653           }
654         }
655       }
656     }
657     PetscCheck(nEdges == nEdges0, PETSC_COMM_SELF, PETSC_ERR_SUP, "nEdges != nEdges0: %d %d", (int)nEdges0, (int)nEdges);
658     if (Edges) qsort(Edges, nEdges, sizeof(Edge), gamg_hem_compare);
659 
660     PetscCall(PetscInfo(info_is, "[%d] HEM iteration %d with %d edges\n", rank, iter, (int)nEdges));
661 
662     /* projection matrix */
663     PetscCall(MatCreate(comm, &P));
664     PetscCall(MatSetType(P, MATAIJ));
665     PetscCall(MatSetSizes(P, nloc, nloc, PETSC_DETERMINE, PETSC_DETERMINE));
666     PetscCall(MatMPIAIJSetPreallocation(P, 1, NULL, 1, NULL));
667     PetscCall(MatSeqAIJSetPreallocation(P, 1, NULL));
668     PetscCall(MatSetUp(P));
669     /* process - communicate - process */
670     for (int sub_it = 0, old_num_edge = 0; /* sub_it < n_sub_its */; /* sub_it++ */) {
671       PetscInt    nactive_edges = 0, n_act_n[3], gn_act_n[3];
672       PetscMPIInt tag1, tag2;
673       PetscCall(VecGetArrayRead(locMaxEdge, &lid_max_ew));
674       if (isMPI) {
675         PetscCall(VecGetArrayRead(ghostMaxEdge, &lghost_max_ew));
676         PetscCall(PetscCommGetNewTag(comm, &tag1));
677         PetscCall(PetscCommGetNewTag(comm, &tag2));
678       }
679       for (int kk = 0; kk < nEdges; kk++) {
680         /* HEM */
681         const Edge    *e    = &Edges[kk];
682         const PetscInt lid0 = e->lid0, gid1 = e->gid1, ghost1_idx = e->ghost1_idx, gid0 = lid0 + my0, lid1 = gid1 - my0;
683         PetscBool      isOK = PETSC_TRUE, print = PETSC_FALSE;
684         if (print) PetscCall(PetscSynchronizedPrintf(comm, "\t[%d] edge (%d %d), %d %d %d\n", rank, (int)gid0, (int)gid1, lid_matched[lid0], (ghost1_idx != -1 && lghost_matched[ghost1_idx]), (ghost1_idx == -1 && lid_matched[lid1])));
685         /* skip if either vertex is matched already */
686         if (lid_matched[lid0] || (ghost1_idx != -1 && lghost_matched[ghost1_idx]) || (ghost1_idx == -1 && lid_matched[lid1])) continue;
687 
688         nactive_edges++;
689         PetscCheck(PetscRealPart(lid_max_ew[lid0]) >= e->weight - MY_MEPS, PETSC_COMM_SELF, PETSC_ERR_SUP, "edge weight %e > max %e", (double)e->weight, (double)PetscRealPart(lid_max_ew[lid0]));
690         if (print) PetscCall(PetscSynchronizedPrintf(comm, "\t[%d] active edge (%d %d), diff0 = %10.4e\n", rank, (int)gid0, (int)gid1, (double)(PetscRealPart(lid_max_ew[lid0]) - (double)e->weight)));
691         // smaller edge, lid_max_ew get updated - e0
692         if (PetscRealPart(lid_max_ew[lid0]) > e->weight + MY_MEPS) {
693           if (print)
694             PetscCall(PetscSynchronizedPrintf(comm, "\t\t[%d] 1) e0 SKIPPING small edge %20.14e edge (%d %d), diff = %10.4e to proc %d. max = %20.14e, w = %20.14e\n", rank, (double)e->weight, (int)gid0, (int)gid1, (double)(PetscRealPart(lid_max_ew[lid0]) - e->weight), ghost1_idx != -1 ? (int)lghost_pe[ghost1_idx] : rank, (double)PetscRealPart(lid_max_ew[lid0]),
695                                               (double)e->weight));
696           continue; // we are basically filter edges here
697         }
698         // e1 - local
699         if (ghost1_idx == -1) {
700           if (PetscRealPart(lid_max_ew[lid1]) > e->weight + MY_MEPS) {
701             if (print)
702               PetscCall(PetscSynchronizedPrintf(comm, "\t\t%c[%d] 2) e1 SKIPPING small local edge %20.14e edge (%d %d), diff = %10.4e\n", ghost1_idx != -1 ? '\t' : ' ', rank, (double)e->weight, (int)gid0, (int)gid1, (double)(PetscRealPart(lid_max_ew[lid1]) - e->weight)));
703             continue; // we are basically filter edges here
704           }
705         } else { // e1 - ghost
706           /* see if edge might get matched on other proc */
707           PetscReal g_max_e1 = PetscRealPart(lghost_max_ew[ghost1_idx]);
708           if (print)
709             PetscCall(PetscSynchronizedPrintf(comm, "\t\t\t[%d] CHECK GHOST e1, edge (%d %d), E0 MAX EDGE WEIGHT = %10.4e, EDGE WEIGHT = %10.4e, diff1 = %10.4e, ghost proc %d with max pe %d on e0 and %d on e1\n", rank, (int)gid0, (int)gid1, (double)PetscRealPart(lid_max_ew[lid0]),
710                                               (double)e->weight, (double)(PetscRealPart(lghost_max_ew[ghost1_idx]) - e->weight), (int)lghost_pe[ghost1_idx], lid_max_pe[lid0], lghost_max_pe[ghost1_idx]));
711           if (g_max_e1 > e->weight + MY_MEPS) {
712             /* PetscCall(PetscSynchronizedPrintf(comm,"\t\t\t\t[%d] 3) ghost e1 SKIPPING small edge (%d %d), diff = %10.4e from proc %d with max pe %d. max = %20.14e, w = %20.14e\n", rank, (int)gid0, (int)gid1, g_max_e1 - e->weight, (int)lghost_pe[ghost1_idx], lghost_max_pe[ghost1_idx], g_max_e1, e->weight )); */
713             continue;
714           } else if (g_max_e1 >= e->weight - MY_MEPS && lghost_pe[ghost1_idx] > rank) { // is 'lghost_max_pe[ghost1_idx] > rank' needed?
715             /* check for max_ea == to this edge and larger processor that will deal with this */
716             if (print)
717               PetscCall(PetscSynchronizedPrintf(comm, "\t\t\t[%d] ghost e1 SKIPPING EQUAL (%d %d), diff = %10.4e from larger proc %d with max pe %d. max = %20.14e, w = %20.14e\n", rank, (int)gid0, (int)gid1,
718                                                 (double)(PetscRealPart(lid_max_ew[lid0]) - (double)e->weight), (int)lghost_pe[ghost1_idx], (int)lghost_max_pe[ghost1_idx], (double)g_max_e1, (double)e->weight));
719             isOK = PETSC_FALSE; // this guy could delete me
720             continue;
721           } else {
722             /* PetscCall(PetscSynchronizedPrintf(comm,"\t[%d] Edge (%d %d) passes gid0 tests, diff = %10.4e from proc %d with max pe %d. max = %20.14e, w = %20.14e\n", rank, (int)gid0, (int)gid1, g_max_e1 - e->weight, (int)lghost_pe[ghost1_idx], lghost_max_pe[ghost1_idx], g_max_e1, e->weight )); */
723           }
724         }
725         /* check ghost for v0 */
726         if (isOK) {
727           PetscReal max_e, ew;
728           if ((ix = lid_cprowID[lid0]) != -1) { /* if I have any ghost neighbors */
729             int n;
730             ii = matB->compressedrow.i;
731             n  = ii[ix + 1] - ii[ix];
732             ap = matB->a + ii[ix];
733             aj = matB->j + ii[ix];
734             for (int jj = 0; jj < n && isOK; jj++) {
735               PetscInt lidj = aj[jj];
736               if (lghost_matched[lidj]) continue;
737               ew = PetscRealPart(ap[jj]);
738               if (ew <= threshold) continue;
739               max_e = PetscRealPart(lghost_max_ew[lidj]);
740               /* check for max_e == to this edge and larger processor that will deal with this */
741               if (ew >= PetscRealPart(lid_max_ew[lid0]) - MY_MEPS && lghost_max_pe[lidj] > rank) isOK = PETSC_FALSE;
742               PetscCheck(ew <= max_e + MY_MEPS, PETSC_COMM_SELF, PETSC_ERR_SUP, "edge weight %e > max %e. ncols = %d, gid0 = %d, gid1 = %d", (double)PetscRealPart(ew), (double)PetscRealPart(max_e), (int)n, (int)(lid0 + my0), (int)lghost_gid[lidj]);
743               if (print)
744                 PetscCall(PetscSynchronizedPrintf(comm, "\t\t\t\t[%d] e0: looked at ghost adj (%d %d), diff = %10.4e, ghost on proc %d (max %d). isOK = %d, %d %d %d; ew = %e, lid0 max ew = %e, diff = %e, eps = %e\n", rank, (int)gid0, (int)lghost_gid[lidj], (double)(max_e - ew), lghost_pe[lidj], lghost_max_pe[lidj], isOK, (double)(ew) >= (double)(max_e - MY_MEPS), ew >= PetscRealPart(lid_max_ew[lid0]) - MY_MEPS, lghost_pe[lidj] > rank, (double)ew, (double)PetscRealPart(lid_max_ew[lid0]), (double)(ew - PetscRealPart(lid_max_ew[lid0])), (double)MY_MEPS));
745             }
746             if (!isOK && print) PetscCall(PetscSynchronizedPrintf(comm, "\t\t[%d] skip edge (%d %d) from ghost inspection\n", rank, (int)gid0, (int)gid1));
747           }
748           /* check local v1 */
749           if (ghost1_idx == -1) {
750             if ((ix = lid_cprowID[lid1]) != -1) { /* if I have any ghost neighbors */
751               int n;
752               ii = matB->compressedrow.i;
753               n  = ii[ix + 1] - ii[ix];
754               ap = matB->a + ii[ix];
755               aj = matB->j + ii[ix];
756               for (int jj = 0; jj < n && isOK; jj++) {
757                 PetscInt lidj = aj[jj];
758                 if (lghost_matched[lidj]) continue;
759                 ew = PetscRealPart(ap[jj]);
760                 if (ew <= threshold) continue;
761                 max_e = PetscRealPart(lghost_max_ew[lidj]);
762                 /* check for max_e == to this edge and larger processor that will deal with this */
763                 if (ew >= PetscRealPart(lid_max_ew[lid1]) - MY_MEPS && lghost_max_pe[lidj] > rank) isOK = PETSC_FALSE;
764                 PetscCheck(ew <= max_e + MY_MEPS, PETSC_COMM_SELF, PETSC_ERR_SUP, "edge weight %e > max %e", (double)PetscRealPart(ew), (double)PetscRealPart(max_e));
765                 if (print)
766                   PetscCall(PetscSynchronizedPrintf(comm, "\t\t\t\t\t[%d] e1: looked at ghost adj (%d %d), diff = %10.4e, ghost on proc %d (max %d)\n", rank, (int)gid0, (int)lghost_gid[lidj], (double)(max_e - ew), lghost_pe[lidj], lghost_max_pe[lidj]));
767               }
768             }
769             if (!isOK && print) PetscCall(PetscSynchronizedPrintf(comm, "\t\t[%d] skip edge (%d %d) from ghost inspection\n", rank, (int)gid0, (int)gid1));
770           }
771         }
772         PetscReal e1_max_w = (ghost1_idx == -1 ? PetscRealPart(lid_max_ew[lid0]) : PetscRealPart(lghost_max_ew[ghost1_idx]));
773         if (print)
774           PetscCall(PetscSynchronizedPrintf(comm, "\t[%d] MATCHING (%d %d) e1 max weight = %e, e1 wight diff %e, %s. isOK = %d\n", rank, (int)gid0, (int)gid1, (double)e1_max_w, (double)(e1_max_w - e->weight), ghost1_idx == -1 ? "local" : "ghost", isOK));
775         /* do it */
776         if (isOK) {
777           if (ghost1_idx == -1) {
778             PetscCheck(!lid_matched[lid1], PETSC_COMM_SELF, PETSC_ERR_SUP, "local %d is matched", (int)gid1);
779             lid_matched[lid1] = PETSC_TRUE;                       /* keep track of what we've done this round */
780             PetscCall(PetscCDMoveAppend(agg_llists, lid0, lid1)); // takes lid1's list and appends to lid0's
781           } else {
782             /* add gid1 to list of ghost deleted by me -- I need their children */
783             PetscMPIInt proc = lghost_pe[ghost1_idx];
784             PetscCheck(!lghost_matched[ghost1_idx], PETSC_COMM_SELF, PETSC_ERR_SUP, "ghost %d is matched", (int)lghost_gid[ghost1_idx]);
785             lghost_matched[ghost1_idx] = PETSC_TRUE;
786             PetscCall(PetscCDAppendID(ghost_deleted_list, proc, ghost1_idx)); /* cache to send messages */
787             PetscCall(PetscCDAppendID(ghost_deleted_list, proc, lid0));
788           }
789           lid_matched[lid0] = PETSC_TRUE; /* keep track of what we've done this round */
790           /* set projection */
791           PetscCall(MatSetValues(P, 1, &gid0, 1, &gid0, &one, INSERT_VALUES));
792           PetscCall(MatSetValues(P, 1, &gid1, 1, &gid0, &one, INSERT_VALUES));
793           //PetscCall(PetscPrintf(comm,"\t %d.%d) match active EDGE %d : (%d %d)\n",iter,sub_it, (int)nactive_edges, (int)gid0, (int)gid1));
794         } /* matched */
795       }   /* edge loop */
796       PetscCall(PetscSynchronizedFlush(comm, PETSC_STDOUT));
797       if (isMPI) PetscCall(VecRestoreArrayRead(ghostMaxEdge, &lghost_max_ew));
798       PetscCall(VecRestoreArrayRead(locMaxEdge, &lid_max_ew));
799       // count active for test, latter, update deleted ghosts
800       n_act_n[0] = nactive_edges;
801       if (ghost_deleted_list) PetscCall(PetscCDCount(ghost_deleted_list, &n_act_n[2]));
802       else n_act_n[2] = 0;
803       PetscCall(PetscCDCount(agg_llists, &n_act_n[1]));
804       PetscCall(MPIU_Allreduce(n_act_n, gn_act_n, 3, MPIU_INT, MPI_SUM, comm));
805       PetscCall(PetscInfo(info_is, "[%d] %d.%d) nactive edges=%" PetscInt_FMT ", ncomm_procs=%d, nEdges=%d, %" PetscInt_FMT " deleted ghosts, N=%" PetscInt_FMT "\n", rank, iter, sub_it, gn_act_n[0], (int)ncomm_procs, (int)nEdges, gn_act_n[2], gn_act_n[1]));
806       /* deal with deleted ghost */
807       if (isMPI) {
808         PetscCDIntNd *pos;
809         PetscInt     *sbuffs1[REQ_BF_SIZE], ndel;
810         PetscInt     *sbuffs2[REQ_BF_SIZE];
811         MPI_Status    status;
812 
813         /* send deleted ghosts */
814         for (int proc_idx = 0; proc_idx < ncomm_procs; proc_idx++) {
815           const PetscMPIInt proc = comm_procs[proc_idx];
816           PetscInt         *sbuff, *pt, scount;
817           MPI_Request      *request;
818           /* count ghosts */
819           PetscCall(PetscCDCountAt(ghost_deleted_list, proc, &ndel));
820           ndel /= 2; // two entries for each proc
821           scount = 2 + 2 * ndel;
822           PetscCall(PetscMalloc1(scount + request_size, &sbuff));
823           /* save requests */
824           sbuffs1[proc_idx] = sbuff;
825           request           = (MPI_Request *)sbuff;
826           sbuff = pt = (PetscInt *)(sbuff + request_size);
827           /* write [ndel, proc, n*[gid1,gid0] */
828           *pt++ = ndel; // number of deleted to send
829           *pt++ = rank; // proc (not used)
830           PetscCall(PetscCDGetHeadPos(ghost_deleted_list, proc, &pos));
831           while (pos) {
832             PetscInt lid0, ghost_idx, gid1;
833             PetscCall(PetscCDIntNdGetID(pos, &ghost_idx));
834             gid1 = lghost_gid[ghost_idx];
835             PetscCall(PetscCDGetNextPos(ghost_deleted_list, proc, &pos));
836             PetscCall(PetscCDIntNdGetID(pos, &lid0));
837             PetscCall(PetscCDGetNextPos(ghost_deleted_list, proc, &pos));
838             *pt++ = gid1;
839             *pt++ = lid0 + my0; // gid0
840           }
841           PetscCheck(pt - sbuff == scount, PETSC_COMM_SELF, PETSC_ERR_SUP, "sbuff-pt != scount: %d", (int)(pt - sbuff));
842           /* MPI_Isend:  tag1 [ndel, proc, n*[gid1,gid0] ] */
843           PetscCallMPI(MPI_Isend(sbuff, scount, MPIU_INT, proc, tag1, comm, request));
844           PetscCall(PetscCDRemoveAllAt(ghost_deleted_list, proc)); // done with this list
845         }
846         /* receive deleted, send back partial aggregates, clear lists */
847         for (int proc_idx = 0; proc_idx < ncomm_procs; proc_idx++) {
848           PetscCallMPI(MPI_Probe(comm_procs[proc_idx] /* MPI_ANY_SOURCE */, tag1, comm, &status));
849           {
850             PetscInt         *pt, *pt2, *pt3, *sbuff, tmp;
851             MPI_Request      *request;
852             int               rcount, scount, ndel;
853             const PetscMPIInt proc = status.MPI_SOURCE;
854             PetscCallMPI(MPI_Get_count(&status, MPIU_INT, &rcount));
855             if (rcount > rbuff_sz) {
856               if (rbuff) PetscCall(PetscFree(rbuff));
857               PetscCall(PetscMalloc1(rcount, &rbuff));
858               rbuff_sz = rcount;
859             }
860             /* MPI_Recv: tag1 [ndel, proc, ndel*[gid1,gid0] ] */
861             PetscCallMPI(MPI_Recv(rbuff, rcount, MPIU_INT, proc, tag1, comm, &status));
862             /* read and count sends *[lid0, n, n*[gid] ] */
863             pt     = rbuff;
864             scount = 0;
865             ndel   = *pt++; // number of deleted to recv
866             tmp    = *pt++; // proc (not used)
867             while (ndel--) {
868               PetscInt gid1 = *pt++, lid1 = gid1 - my0;
869               int      gh_gid0 = *pt++; // gid on other proc (not used here to count)
870               PetscCheck(lid1 >= 0 && lid1 < nloc, PETSC_COMM_SELF, PETSC_ERR_SUP, "received ghost deleted %d", (int)gid1);
871               PetscCheck(!lid_matched[lid1], PETSC_COMM_SELF, PETSC_ERR_PLIB, "%d) received matched local gid %" PetscInt_FMT ",%d, with ghost (lid) %" PetscInt_FMT " from proc %d", sub_it, gid1, gh_gid0, tmp, proc);
872               lid_matched[lid1] = PETSC_TRUE;                    /* keep track of what we've done this round */
873               PetscCall(PetscCDCountAt(agg_llists, lid1, &tmp)); // n
874               /* PetscCheck(tmp == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "sending %d (!= 1) size aggregate. gid-0 %d, from %d (gid-1 %d)", (int)tmp, (int) gid, proc, gh_gid0); */
875               scount += tmp + 2; // lid0, n, n*[gid]
876             }
877             PetscCheck((pt - rbuff) == rcount, PETSC_COMM_SELF, PETSC_ERR_SUP, "receive buffer size != num read: %d; rcount: %d", (int)(pt - rbuff), rcount);
878             /* send tag2: *[gid0, n, n*[gid] ] */
879             PetscCall(PetscMalloc1(scount + request_size, &sbuff));
880             sbuffs2[proc_idx] = sbuff; /* cache request */
881             request           = (MPI_Request *)sbuff;
882             pt2 = sbuff = sbuff + request_size;
883             // read again: n, proc, n*[gid1,gid0]
884             pt   = rbuff;
885             ndel = *pt++;
886             tmp  = *pt++; // proc (not used)
887             while (ndel--) {
888               PetscInt gid1 = *pt++, lid1 = gid1 - my0, gh_gid0 = *pt++;
889               /* write [gid0, aggSz, aggSz[gid] ] */
890               *pt2++ = gh_gid0;
891               pt3    = pt2++; /* save pointer for later */
892               PetscCall(PetscCDGetHeadPos(agg_llists, lid1, &pos));
893               while (pos) {
894                 PetscInt gid;
895                 PetscCall(PetscCDIntNdGetID(pos, &gid));
896                 PetscCall(PetscCDGetNextPos(agg_llists, lid1, &pos));
897                 *pt2++ = gid;
898               }
899               *pt3 = (pt2 - pt3) - 1;
900               /* clear list */
901               PetscCall(PetscCDRemoveAllAt(agg_llists, lid1));
902             }
903             PetscCheck((pt2 - sbuff) == scount, PETSC_COMM_SELF, PETSC_ERR_SUP, "buffer size != num write: %d %d", (int)(pt2 - sbuff), (int)scount);
904             /* MPI_Isend: requested data tag2 *[lid0, n, n*[gid1] ] */
905             PetscCallMPI(MPI_Isend(sbuff, scount, MPIU_INT, proc, tag2, comm, request));
906           }
907         } // proc_idx
908         /* receive tag2 *[gid0, n, n*[gid] ] */
909         for (int proc_idx = 0; proc_idx < ncomm_procs; proc_idx++) {
910           PetscMPIInt proc;
911           PetscInt   *pt;
912           int         rcount;
913           PetscCallMPI(MPI_Probe(comm_procs[proc_idx] /* MPI_ANY_SOURCE */, tag2, comm, &status));
914           PetscCallMPI(MPI_Get_count(&status, MPIU_INT, &rcount));
915           if (rcount > rbuff_sz) {
916             if (rbuff) PetscCall(PetscFree(rbuff));
917             PetscCall(PetscMalloc1(rcount, &rbuff));
918             rbuff_sz = rcount;
919           }
920           proc = status.MPI_SOURCE;
921           /* MPI_Recv:  tag1 [n, proc, n*[gid1,lid0] ] */
922           PetscCallMPI(MPI_Recv(rbuff, rcount, MPIU_INT, proc, tag2, comm, &status));
923           pt = rbuff;
924           while (pt - rbuff < rcount) {
925             PetscInt gid0 = *pt++, n = *pt++;
926             while (n--) {
927               PetscInt gid1 = *pt++;
928               PetscCall(PetscCDAppendID(agg_llists, gid0 - my0, gid1));
929             }
930           }
931           PetscCheck((pt - rbuff) == rcount, PETSC_COMM_SELF, PETSC_ERR_SUP, "recv buffer size != num read: %d %d", (int)(pt - rbuff), (int)rcount);
932         }
933         /* wait for tag1 isends */
934         for (int proc_idx = 0; proc_idx < ncomm_procs; proc_idx++) {
935           MPI_Request *request;
936           request = (MPI_Request *)sbuffs1[proc_idx];
937           PetscCallMPI(MPI_Wait(request, &status));
938           PetscCall(PetscFree(sbuffs1[proc_idx]));
939         }
940         /* wait for tag2 isends */
941         for (int proc_idx = 0; proc_idx < ncomm_procs; proc_idx++) {
942           MPI_Request *request = (MPI_Request *)sbuffs2[proc_idx];
943           PetscCallMPI(MPI_Wait(request, &status));
944           PetscCall(PetscFree(sbuffs2[proc_idx]));
945         }
946       } /* MPI */
947       /* set 'lghost_matched' - use locMaxEdge, ghostMaxEdge (recomputed next) */
948       if (isMPI) {
949         const PetscScalar *sbuff;
950         for (PetscInt kk = 0, gid = my0; kk < nloc; kk++, gid++) {
951           PetscScalar vval = lid_matched[kk] ? 1.0 : 0.0;
952           PetscCall(VecSetValues(locMaxEdge, 1, &gid, &vval, INSERT_VALUES)); /* set with GID */
953         }
954         PetscCall(VecAssemblyBegin(locMaxEdge));
955         PetscCall(VecAssemblyEnd(locMaxEdge));
956         PetscCall(VecScatterBegin(mpimat->Mvctx, locMaxEdge, ghostMaxEdge, INSERT_VALUES, SCATTER_FORWARD));
957         PetscCall(VecScatterEnd(mpimat->Mvctx, locMaxEdge, ghostMaxEdge, INSERT_VALUES, SCATTER_FORWARD));
958         PetscCall(VecGetArrayRead(ghostMaxEdge, &sbuff));
959         for (int kk = 0; kk < num_ghosts; kk++) { lghost_matched[kk] = (PetscBool)(PetscRealPart(sbuff[kk]) != 0.0); }
960         PetscCall(VecRestoreArrayRead(ghostMaxEdge, &sbuff));
961       }
962       /* compute 'locMaxEdge' inside sub iteration b/c max weight can drop as neighbors are matched */
963       for (PetscInt kk = 0, gid = my0; kk < nloc; kk++, gid++) {
964         PetscReal      max_e = 0., tt;
965         PetscScalar    vval;
966         const PetscInt lid    = kk;
967         int            max_pe = rank, pe, n;
968         ii                    = matA->i;
969         n                     = ii[lid + 1] - ii[lid];
970         aj                    = PetscSafePointerPlusOffset(matA->j, ii[lid]);
971         ap                    = PetscSafePointerPlusOffset(matA->a, ii[lid]);
972         for (int jj = 0; jj < n; jj++) {
973           PetscInt lidj = aj[jj];
974           if (lid_matched[lidj]) continue; /* this is new - can change local max */
975           if (lidj != lid && PetscRealPart(ap[jj]) > max_e) max_e = PetscRealPart(ap[jj]);
976         }
977         if (lid_cprowID && (ix = lid_cprowID[lid]) != -1) { /* if I have any ghost neighbors */
978           ii = matB->compressedrow.i;
979           n  = ii[ix + 1] - ii[ix];
980           ap = matB->a + ii[ix];
981           aj = matB->j + ii[ix];
982           for (int jj = 0; jj < n; jj++) {
983             PetscInt lidj = aj[jj];
984             if (lghost_matched[lidj]) continue;
985             if ((tt = PetscRealPart(ap[jj])) > max_e) max_e = tt;
986           }
987         }
988         vval = (PetscScalar)max_e;
989         PetscCall(VecSetValues(locMaxEdge, 1, &gid, &vval, INSERT_VALUES)); /* set with GID */
990         // max PE with max edge
991         if (lid_cprowID && (ix = lid_cprowID[lid]) != -1) { /* if I have any ghost neighbors */
992           ii = matB->compressedrow.i;
993           n  = ii[ix + 1] - ii[ix];
994           ap = matB->a + ii[ix];
995           aj = matB->j + ii[ix];
996           for (int jj = 0; jj < n; jj++) {
997             PetscInt lidj = aj[jj];
998             if (lghost_matched[lidj]) continue;
999             if ((pe = lghost_pe[aj[jj]]) > max_pe && PetscRealPart(ap[jj]) >= max_e - MY_MEPS) { max_pe = pe; }
1000           }
1001         }
1002         vval = (PetscScalar)max_pe;
1003         PetscCall(VecSetValues(locMaxPE, 1, &gid, &vval, INSERT_VALUES));
1004       }
1005       PetscCall(VecAssemblyBegin(locMaxEdge));
1006       PetscCall(VecAssemblyEnd(locMaxEdge));
1007       PetscCall(VecAssemblyBegin(locMaxPE));
1008       PetscCall(VecAssemblyEnd(locMaxPE));
1009       /* compute 'lghost_max_ew' and 'lghost_max_pe' to get ready for next iteration*/
1010       if (isMPI) {
1011         const PetscScalar *buf;
1012         PetscCall(VecScatterBegin(mpimat->Mvctx, locMaxEdge, ghostMaxEdge, INSERT_VALUES, SCATTER_FORWARD));
1013         PetscCall(VecScatterEnd(mpimat->Mvctx, locMaxEdge, ghostMaxEdge, INSERT_VALUES, SCATTER_FORWARD));
1014         PetscCall(VecScatterBegin(mpimat->Mvctx, locMaxPE, ghostMaxPE, INSERT_VALUES, SCATTER_FORWARD));
1015         PetscCall(VecScatterEnd(mpimat->Mvctx, locMaxPE, ghostMaxPE, INSERT_VALUES, SCATTER_FORWARD));
1016         PetscCall(VecGetArrayRead(ghostMaxPE, &buf));
1017         for (int kk = 0; kk < num_ghosts; kk++) {
1018           lghost_max_pe[kk] = (PetscMPIInt)PetscRealPart(buf[kk]); // the MAX proc of the ghost now
1019         }
1020         PetscCall(VecRestoreArrayRead(ghostMaxPE, &buf));
1021       }
1022       // if no active edges, stop
1023       if (gn_act_n[0] < 1) break;
1024       // inc and check (self stopping iteration
1025       PetscCheck(old_num_edge != gn_act_n[0], PETSC_COMM_SELF, PETSC_ERR_SUP, "HEM stalled step %d/%d", sub_it + 1, n_sub_its);
1026       sub_it++;
1027       PetscCheck(sub_it < n_sub_its, PETSC_COMM_SELF, PETSC_ERR_SUP, "failed to finish HEM step %d/%d", sub_it + 1, n_sub_its);
1028       old_num_edge = gn_act_n[0];
1029     } /* sub_it loop */
1030     /* clean up iteration */
1031     PetscCall(PetscFree(Edges));
1032     if (mpimat) { // can be hoisted
1033       PetscCall(VecRestoreArrayRead(ghostMaxEdge, &lghost_max_ew));
1034       PetscCall(VecDestroy(&ghostMaxEdge));
1035       PetscCall(VecDestroy(&ghostMaxPE));
1036       PetscCall(PetscFree(lghost_pe));
1037       PetscCall(PetscFree(lghost_gid));
1038       PetscCall(PetscFree(lghost_matched));
1039       PetscCall(PetscFree(lghost_max_pe));
1040     }
1041     PetscCall(VecDestroy(&locMaxEdge));
1042     PetscCall(VecDestroy(&locMaxPE));
1043     /* create next graph */
1044     {
1045       Vec diag;
1046       /* add identity for unmatched vertices so they stay alive */
1047       for (PetscInt kk = 0, gid1, gid = my0; kk < nloc; kk++, gid++) {
1048         if (!lid_matched[kk]) {
1049           const PetscInt lid = kk;
1050           PetscCDIntNd  *pos;
1051           PetscCall(PetscCDGetHeadPos(agg_llists, lid, &pos));
1052           PetscCheck(pos, PETSC_COMM_SELF, PETSC_ERR_PLIB, "empty list in singleton: %d", (int)gid);
1053           PetscCall(PetscCDIntNdGetID(pos, &gid1));
1054           PetscCheck(gid1 == gid, PETSC_COMM_SELF, PETSC_ERR_PLIB, "first in list (%d) in singleton not %d", (int)gid1, (int)gid);
1055           PetscCall(MatSetValues(P, 1, &gid, 1, &gid, &one, INSERT_VALUES));
1056         }
1057       }
1058       PetscCall(MatAssemblyBegin(P, MAT_FINAL_ASSEMBLY));
1059       PetscCall(MatAssemblyEnd(P, MAT_FINAL_ASSEMBLY));
1060 
1061       /* project to make new graph with collapsed edges */
1062       PetscCall(MatPtAP(cMat, P, MAT_INITIAL_MATRIX, 1.0, &tMat));
1063       PetscCall(MatDestroy(&P));
1064       PetscCall(MatDestroy(&cMat));
1065       cMat = tMat;
1066       PetscCall(MatCreateVecs(cMat, &diag, NULL));
1067       PetscCall(MatGetDiagonal(cMat, diag));
1068       PetscCall(VecReciprocal(diag));
1069       PetscCall(VecSqrtAbs(diag));
1070       PetscCall(MatDiagonalScale(cMat, diag, diag));
1071       PetscCall(VecDestroy(&diag));
1072     }
1073   } /* coarsen iterator */
1074 
1075   /* make fake matrix with Mat->B only for smoothed agg QR. Need this if we make an aux graph (ie, PtAP) with k > 1 */
1076   if (size > 1) {
1077     Mat           mat;
1078     PetscCDIntNd *pos;
1079     PetscInt      NN, MM, jj = 0, mxsz = 0;
1080 
1081     for (int kk = 0; kk < nloc; kk++) {
1082       PetscCall(PetscCDCountAt(agg_llists, kk, &jj));
1083       if (jj > mxsz) mxsz = jj;
1084     }
1085     PetscCall(MatGetSize(a_Gmat, &MM, &NN));
1086     if (mxsz > MM - nloc) mxsz = MM - nloc;
1087     /* matrix of ghost adj for square graph */
1088     PetscCall(MatCreateAIJ(comm, nloc, nloc, PETSC_DETERMINE, PETSC_DETERMINE, 0, NULL, mxsz, NULL, &mat));
1089     for (PetscInt lid = 0, gid = my0; lid < nloc; lid++, gid++) {
1090       PetscCall(PetscCDGetHeadPos(agg_llists, lid, &pos));
1091       while (pos) {
1092         PetscInt gid1;
1093         PetscCall(PetscCDIntNdGetID(pos, &gid1));
1094         PetscCall(PetscCDGetNextPos(agg_llists, lid, &pos));
1095         if (gid1 < my0 || gid1 >= my0 + nloc) PetscCall(MatSetValues(mat, 1, &gid, 1, &gid1, &one, ADD_VALUES));
1096       }
1097     }
1098     PetscCall(MatAssemblyBegin(mat, MAT_FINAL_ASSEMBLY));
1099     PetscCall(MatAssemblyEnd(mat, MAT_FINAL_ASSEMBLY));
1100     PetscCall(PetscCDSetMat(agg_llists, mat));
1101     PetscCall(PetscCDDestroy(ghost_deleted_list));
1102     if (rbuff_sz) PetscCall(PetscFree(rbuff)); // always true
1103   }
1104   // move BCs into some node
1105   if (bc_list) {
1106     PetscCDIntNd *pos;
1107     PetscCall(PetscCDGetHeadPos(bc_list, 0, &pos));
1108     while (pos) {
1109       PetscInt gid1;
1110       PetscCall(PetscCDIntNdGetID(pos, &gid1));
1111       PetscCall(PetscCDGetNextPos(bc_list, 0, &pos));
1112       PetscCall(PetscCDAppendID(agg_llists, bc_agg, gid1));
1113     }
1114     PetscCall(PetscCDRemoveAllAt(bc_list, 0));
1115     PetscCall(PetscCDDestroy(bc_list));
1116   }
1117   {
1118     // check sizes -- all vertices must get in graph
1119     PetscInt sz, globalsz, MM;
1120     PetscCall(MatGetSize(a_Gmat, &MM, NULL));
1121     PetscCall(PetscCDCount(agg_llists, &sz));
1122     PetscCall(MPIU_Allreduce(&sz, &globalsz, 1, MPIU_INT, MPI_SUM, comm));
1123     PetscCheck(MM == globalsz, comm, PETSC_ERR_SUP, "lost %d equations ?", (int)(MM - globalsz));
1124   }
1125   // cleanup
1126   PetscCall(MatDestroy(&cMat));
1127   PetscCall(PetscFree(lid_cprowID));
1128   PetscCall(PetscFree(lid_max_pe));
1129   PetscCall(PetscFree(lid_matched));
1130   PetscCall(ISDestroy(&info_is));
1131   PetscFunctionReturn(PETSC_SUCCESS);
1132 }
1133 
1134 /*
1135    HEM coarsen, simple greedy.
1136 */
1137 static PetscErrorCode MatCoarsenApply_HEM(MatCoarsen coarse)
1138 {
1139   Mat mat = coarse->graph;
1140 
1141   PetscFunctionBegin;
1142   PetscCall(MatCoarsenApply_HEM_private(mat, coarse->max_it, coarse->threshold, &coarse->agg_lists));
1143 
1144   PetscFunctionReturn(PETSC_SUCCESS);
1145 }
1146 
1147 static PetscErrorCode MatCoarsenView_HEM(MatCoarsen coarse, PetscViewer viewer)
1148 {
1149   PetscMPIInt rank;
1150   PetscBool   iascii;
1151 
1152   PetscFunctionBegin;
1153   PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)coarse), &rank));
1154   PetscCall(PetscObjectTypeCompare((PetscObject)viewer, PETSCVIEWERASCII, &iascii));
1155   if (iascii) {
1156     PetscCDIntNd *pos, *pos2;
1157     PetscCall(PetscViewerASCIIPrintf(viewer, "%d matching steps with threshold = %g\n", (int)coarse->max_it, (double)coarse->threshold));
1158     PetscCall(PetscViewerASCIIPushSynchronized(viewer));
1159     for (PetscInt kk = 0; kk < coarse->agg_lists->size; kk++) {
1160       PetscCall(PetscCDGetHeadPos(coarse->agg_lists, kk, &pos));
1161       if ((pos2 = pos)) PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "selected local %d: ", (int)kk));
1162       while (pos) {
1163         PetscInt gid1;
1164         PetscCall(PetscCDIntNdGetID(pos, &gid1));
1165         PetscCall(PetscCDGetNextPos(coarse->agg_lists, kk, &pos));
1166         PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, " %d ", (int)gid1));
1167       }
1168       if (pos2) PetscCall(PetscViewerASCIISynchronizedPrintf(viewer, "\n"));
1169     }
1170     PetscCall(PetscViewerFlush(viewer));
1171     PetscCall(PetscViewerASCIIPopSynchronized(viewer));
1172   }
1173   PetscFunctionReturn(PETSC_SUCCESS);
1174 }
1175 
1176 /*
 1177   MatCoarsenCreate_HEM - A coarsener that uses HEM, a simple greedy heavy-edge-matching coarsener
1178 */
1179 PETSC_EXTERN PetscErrorCode MatCoarsenCreate_HEM(MatCoarsen coarse)
1180 {
1181   PetscFunctionBegin;
1182   coarse->ops->apply = MatCoarsenApply_HEM;
1183   coarse->ops->view  = MatCoarsenView_HEM;
1184   coarse->max_it     = 4;
1185   PetscFunctionReturn(PETSC_SUCCESS);
1186 }
1187