xref: /petsc/src/ksp/pc/impls/tfs/xxt.c (revision 2d30e087755efd99e28fdfe792ffbeb2ee1ea928)
1 
2 /*
3 Module Name: xxt
4 Module Info:
5 
6 author:  Henry M. Tufo III
7 e-mail:  hmt@asci.uchicago.edu
8 contact:
9 +--------------------------------+--------------------------------+
10 |MCS Division - Building 221     |Department of Computer Science  |
11 |Argonne National Laboratory     |Ryerson 152                     |
12 |9700 S. Cass Avenue             |The University of Chicago       |
13 |Argonne, IL  60439              |Chicago, IL  60637              |
14 |(630) 252-5354/5986 ph/fx       |(773) 702-6019/8487 ph/fx       |
15 +--------------------------------+--------------------------------+
16 
17 Last Modification: 3.20.01
18 */
19 #include <../src/ksp/pc/impls/tfs/tfs.h>
20 
21 #define LEFT  -1
22 #define RIGHT 1
23 #define BOTH  0
24 
25 typedef struct xxt_solver_info {
26   PetscInt      n, m, n_global, m_global;
27   PetscInt      nnz, max_nnz, msg_buf_sz;
28   PetscInt     *nsep, *lnsep, *fo, nfo, *stages;
29   PetscInt     *col_sz, *col_indices;
30   PetscScalar **col_vals, *x, *solve_uu, *solve_w;
31   PetscInt      nsolves;
32   PetscScalar   tot_solve_time;
33 } xxt_info;
34 
35 typedef struct matvec_info {
36   PetscInt     n, m, n_global, m_global;
37   PetscInt    *local2global;
38   PCTFS_gs_ADT PCTFS_gs_handle;
39   PetscErrorCode (*matvec)(struct matvec_info *, PetscScalar *, PetscScalar *);
40   void *grid_data;
41 } mv_info;
42 
43 struct xxt_CDT {
44   PetscInt  id;
45   PetscInt  ns;
46   PetscInt  level;
47   xxt_info *info;
48   mv_info  *mvi;
49 };
50 
51 static PetscInt n_xxt         = 0;
52 static PetscInt n_xxt_handles = 0;
53 
54 /* prototypes */
55 static PetscErrorCode do_xxt_solve(xxt_ADT xxt_handle, PetscScalar *rhs);
56 static PetscErrorCode check_handle(xxt_ADT xxt_handle);
57 static PetscErrorCode det_separators(xxt_ADT xxt_handle);
58 static PetscErrorCode do_matvec(mv_info *A, PetscScalar *v, PetscScalar *u);
59 static PetscErrorCode xxt_generate(xxt_ADT xxt_handle);
60 static PetscErrorCode do_xxt_factor(xxt_ADT xxt_handle);
61 static mv_info       *set_mvi(PetscInt *local2global, PetscInt n, PetscInt m, PetscErrorCode (*matvec)(mv_info *, PetscScalar *, PetscScalar *), void *grid_data);
62 
63 xxt_ADT XXT_new(void) {
64   xxt_ADT xxt_handle;
65 
66   /* rolling count on n_xxt ... pot. problem here */
67   n_xxt_handles++;
68   xxt_handle       = (xxt_ADT)malloc(sizeof(struct xxt_CDT));
69   xxt_handle->id   = ++n_xxt;
70   xxt_handle->info = NULL;
71   xxt_handle->mvi  = NULL;
72 
73   return (xxt_handle);
74 }
75 
76 PetscErrorCode XXT_factor(xxt_ADT   xxt_handle,                                           /* prev. allocated xxt  handle */
77                           PetscInt *local2global,                                         /* global column mapping       */
78                           PetscInt  n,                                                    /* local num rows              */
79                           PetscInt  m,                                                    /* local num cols              */
80                           PetscErrorCode (*matvec)(void *, PetscScalar *, PetscScalar *), /* b_loc=A_local.x_loc         */
81                           void *grid_data)                                                /* grid data for matvec        */
82 {
83   PCTFS_comm_init();
84   check_handle(xxt_handle);
85 
86   /* only 2^k for now and all nodes participating */
87   PetscCheck((1 << (xxt_handle->level = PCTFS_i_log2_num_nodes)) == PCTFS_num_nodes, PETSC_COMM_SELF, PETSC_ERR_PLIB, "only 2^k for now and MPI_COMM_WORLD!!! %d != %d", 1 << PCTFS_i_log2_num_nodes, PCTFS_num_nodes);
88 
89   /* space for X info */
90   xxt_handle->info = (xxt_info *)malloc(sizeof(xxt_info));
91 
92   /* set up matvec handles */
93   xxt_handle->mvi = set_mvi(local2global, n, m, (PetscErrorCode(*)(mv_info *, PetscScalar *, PetscScalar *))matvec, grid_data);
94 
95   /* matrix is assumed to be of full rank */
96   /* LATER we can reset to indicate rank def. */
97   xxt_handle->ns = 0;
98 
99   /* determine separators and generate firing order - NB xxt info set here */
100   det_separators(xxt_handle);
101 
102   return (do_xxt_factor(xxt_handle));
103 }
104 
105 PetscErrorCode XXT_solve(xxt_ADT xxt_handle, PetscScalar *x, PetscScalar *b) {
106   PCTFS_comm_init();
107   check_handle(xxt_handle);
108 
109   /* need to copy b into x? */
110   if (b) PCTFS_rvec_copy(x, b, xxt_handle->mvi->n);
111   return do_xxt_solve(xxt_handle, x);
112 }
113 
114 PetscInt XXT_free(xxt_ADT xxt_handle) {
115   PCTFS_comm_init();
116   check_handle(xxt_handle);
117   n_xxt_handles--;
118 
119   free(xxt_handle->info->nsep);
120   free(xxt_handle->info->lnsep);
121   free(xxt_handle->info->fo);
122   free(xxt_handle->info->stages);
123   free(xxt_handle->info->solve_uu);
124   free(xxt_handle->info->solve_w);
125   free(xxt_handle->info->x);
126   free(xxt_handle->info->col_vals);
127   free(xxt_handle->info->col_sz);
128   free(xxt_handle->info->col_indices);
129   free(xxt_handle->info);
130   free(xxt_handle->mvi->local2global);
131   PCTFS_gs_free(xxt_handle->mvi->PCTFS_gs_handle);
132   free(xxt_handle->mvi);
133   free(xxt_handle);
134 
135   /* if the check fails we nuke */
136   /* if NULL pointer passed to free we nuke */
137   /* if the calls to free fail that's not my problem */
138   return (0);
139 }
140 
141 /* This function is currently unused */
142 PetscErrorCode XXT_stats(xxt_ADT xxt_handle) {
143   PetscInt    op[]  = {NON_UNIFORM, GL_MIN, GL_MAX, GL_ADD, GL_MIN, GL_MAX, GL_ADD, GL_MIN, GL_MAX, GL_ADD};
144   PetscInt    fop[] = {NON_UNIFORM, GL_MIN, GL_MAX, GL_ADD};
145   PetscInt    vals[9], work[9];
146   PetscScalar fvals[3], fwork[3];
147 
148   PetscFunctionBegin;
149   PetscCall(PCTFS_comm_init());
150   PetscCall(check_handle(xxt_handle));
151 
152   /* if factorization not done there are no stats */
153   if (!xxt_handle->info || !xxt_handle->mvi) {
154     if (!PCTFS_my_id) PetscCall(PetscPrintf(PETSC_COMM_WORLD, "XXT_stats() :: no stats available!\n"));
155     PetscFunctionReturn(0);
156   }
157 
158   vals[0] = vals[1] = vals[2] = xxt_handle->info->nnz;
159   vals[3] = vals[4] = vals[5] = xxt_handle->mvi->n;
160   vals[6] = vals[7] = vals[8] = xxt_handle->info->msg_buf_sz;
161   PCTFS_giop(vals, work, PETSC_STATIC_ARRAY_LENGTH(op) - 1, op);
162 
163   fvals[0] = fvals[1] = fvals[2] = xxt_handle->info->tot_solve_time / xxt_handle->info->nsolves++;
164   PCTFS_grop(fvals, fwork, PETSC_STATIC_ARRAY_LENGTH(fop) - 1, fop);
165 
166   if (!PCTFS_my_id) {
167     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%d :: min   xxt_nnz=%" PetscInt_FMT "\n", PCTFS_my_id, vals[0]));
168     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%d :: max   xxt_nnz=%" PetscInt_FMT "\n", PCTFS_my_id, vals[1]));
169     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%d :: avg   xxt_nnz=%g\n", PCTFS_my_id, (double)(1.0 * vals[2] / PCTFS_num_nodes)));
170     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%d :: tot   xxt_nnz=%" PetscInt_FMT "\n", PCTFS_my_id, vals[2]));
171     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%d :: xxt   C(2d)  =%g\n", PCTFS_my_id, (double)(vals[2] / (PetscPowReal(1.0 * vals[5], 1.5)))));
172     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%d :: xxt   C(3d)  =%g\n", PCTFS_my_id, (double)(vals[2] / (PetscPowReal(1.0 * vals[5], 1.6667)))));
173     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%d :: min   xxt_n  =%" PetscInt_FMT "\n", PCTFS_my_id, vals[3]));
174     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%d :: max   xxt_n  =%" PetscInt_FMT "\n", PCTFS_my_id, vals[4]));
175     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%d :: avg   xxt_n  =%g\n", PCTFS_my_id, (double)(1.0 * vals[5] / PCTFS_num_nodes)));
176     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%d :: tot   xxt_n  =%" PetscInt_FMT "\n", PCTFS_my_id, vals[5]));
177     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%d :: min   xxt_buf=%" PetscInt_FMT "\n", PCTFS_my_id, vals[6]));
178     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%d :: max   xxt_buf=%" PetscInt_FMT "\n", PCTFS_my_id, vals[7]));
179     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%d :: avg   xxt_buf=%g\n", PCTFS_my_id, (double)(1.0 * vals[8] / PCTFS_num_nodes)));
180     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%d :: min   xxt_slv=%g\n", PCTFS_my_id, (double)PetscRealPart(fvals[0])));
181     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%d :: max   xxt_slv=%g\n", PCTFS_my_id, (double)PetscRealPart(fvals[1])));
182     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%d :: avg   xxt_slv=%g\n", PCTFS_my_id, (double)PetscRealPart(fvals[2] / PCTFS_num_nodes)));
183   }
184   PetscFunctionReturn(0);
185 }
186 
187 /*
188 
189 Description: get A_local, local portion of global coarse matrix which
190 is a row dist. nxm matrix w/ n<m.
191    o my_ml holds address of ML struct associated w/A_local and coarse grid
192    o local2global holds global number of column i (i=0,...,m-1)
193    o local2global holds global number of row    i (i=0,...,n-1)
194    o mylocmatvec performs A_local . vec_local (note that gs is performed using
195    PCTFS_gs_init/gop).
196 
197 mylocmatvec = my_ml->Amat[grid_tag].matvec->external;
198 mylocmatvec (void :: void *data, double *in, double *out)
199 */
200 static PetscErrorCode do_xxt_factor(xxt_ADT xxt_handle) {
201   return xxt_generate(xxt_handle);
202 }
203 
204 static PetscErrorCode xxt_generate(xxt_ADT xxt_handle) {
205   PetscInt      i, j, k, idex;
206   PetscInt      dim, col;
207   PetscScalar  *u, *uu, *v, *z, *w, alpha, alpha_w;
208   PetscInt     *segs;
209   PetscInt      op[] = {GL_ADD, 0};
210   PetscInt      off, len;
211   PetscScalar  *x_ptr;
212   PetscInt     *iptr, flag;
213   PetscInt      start = 0, end, work;
214   PetscInt      op2[] = {GL_MIN, 0};
215   PCTFS_gs_ADT  PCTFS_gs_handle;
216   PetscInt     *nsep, *lnsep, *fo;
217   PetscInt      a_n            = xxt_handle->mvi->n;
218   PetscInt      a_m            = xxt_handle->mvi->m;
219   PetscInt     *a_local2global = xxt_handle->mvi->local2global;
220   PetscInt      level;
221   PetscInt      xxt_nnz = 0, xxt_max_nnz = 0;
222   PetscInt      n, m;
223   PetscInt     *col_sz, *col_indices, *stages;
224   PetscScalar **col_vals, *x;
225   PetscInt      n_global;
226   PetscBLASInt  i1  = 1, dlen;
227   PetscScalar   dm1 = -1.0;
228 
229   n               = xxt_handle->mvi->n;
230   nsep            = xxt_handle->info->nsep;
231   lnsep           = xxt_handle->info->lnsep;
232   fo              = xxt_handle->info->fo;
233   end             = lnsep[0];
234   level           = xxt_handle->level;
235   PCTFS_gs_handle = xxt_handle->mvi->PCTFS_gs_handle;
236 
237   /* is there a null space? */
238   /* LATER add in ability to detect null space by checking alpha */
239   for (i = 0, j = 0; i <= level; i++) j += nsep[i];
240 
241   m = j - xxt_handle->ns;
242   if (m != j) PetscCall(PetscPrintf(PETSC_COMM_WORLD, "xxt_generate() :: null space exists %" PetscInt_FMT " %" PetscInt_FMT " %" PetscInt_FMT "\n", m, j, xxt_handle->ns));
243 
244   /* get and initialize storage for x local         */
245   /* note that x local is nxm and stored by columns */
246   col_sz      = (PetscInt *)malloc(m * sizeof(PetscInt));
247   col_indices = (PetscInt *)malloc((2 * m + 1) * sizeof(PetscInt));
248   col_vals    = (PetscScalar **)malloc(m * sizeof(PetscScalar *));
249   for (i = j = 0; i < m; i++, j += 2) {
250     col_indices[j] = col_indices[j + 1] = col_sz[i] = -1;
251     col_vals[i]                                     = NULL;
252   }
253   col_indices[j] = -1;
254 
255   /* size of separators for each sub-hc working from bottom of tree to top */
256   /* this looks like nsep[]=segments */
257   stages = (PetscInt *)malloc((level + 1) * sizeof(PetscInt));
258   segs   = (PetscInt *)malloc((level + 1) * sizeof(PetscInt));
259   PCTFS_ivec_zero(stages, level + 1);
260   PCTFS_ivec_copy(segs, nsep, level + 1);
261   for (i = 0; i < level; i++) segs[i + 1] += segs[i];
262   stages[0] = segs[0];
263 
264   /* temporary vectors  */
265   u  = (PetscScalar *)malloc(n * sizeof(PetscScalar));
266   z  = (PetscScalar *)malloc(n * sizeof(PetscScalar));
267   v  = (PetscScalar *)malloc(a_m * sizeof(PetscScalar));
268   uu = (PetscScalar *)malloc(m * sizeof(PetscScalar));
269   w  = (PetscScalar *)malloc(m * sizeof(PetscScalar));
270 
271   /* extra nnz due to replication of vertices across separators */
272   for (i = 1, j = 0; i <= level; i++) j += nsep[i];
273 
274   /* storage for sparse x values */
275   n_global    = xxt_handle->info->n_global;
276   xxt_max_nnz = (PetscInt)(2.5 * PetscPowReal(1.0 * n_global, 1.6667) + j * n / 2) / PCTFS_num_nodes;
277   x           = (PetscScalar *)malloc(xxt_max_nnz * sizeof(PetscScalar));
278   xxt_nnz     = 0;
279 
280   /* LATER - can embed next sep to fire in gs */
281   /* time to make the donuts - generate X factor */
282   for (dim = i = j = 0; i < m; i++) {
283     /* time to move to the next level? */
284     while (i == segs[dim]) {
285       PetscCheck(dim != level, PETSC_COMM_SELF, PETSC_ERR_PLIB, "dim about to exceed level");
286       stages[dim++] = i;
287       end += lnsep[dim];
288     }
289     stages[dim] = i;
290 
291     /* which column are we firing? */
292     /* i.e. set v_l */
293     /* use new seps and do global min across hc to determine which one to fire */
294     (start < end) ? (col = fo[start]) : (col = INT_MAX);
295     PCTFS_giop_hc(&col, &work, 1, op2, dim);
296 
297     /* shouldn't need this */
298     if (col == INT_MAX) {
299       PetscCall(PetscInfo(0, "hey ... col==INT_MAX??\n"));
300       continue;
301     }
302 
303     /* do I own it? I should */
304     PCTFS_rvec_zero(v, a_m);
305     if (col == fo[start]) {
306       start++;
307       idex = PCTFS_ivec_linear_search(col, a_local2global, a_n);
308       if (idex != -1) {
309         v[idex] = 1.0;
310         j++;
311       } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "NOT FOUND!");
312     } else {
313       idex = PCTFS_ivec_linear_search(col, a_local2global, a_m);
314       if (idex != -1) v[idex] = 1.0;
315     }
316 
317     /* perform u = A.v_l */
318     PCTFS_rvec_zero(u, n);
319     do_matvec(xxt_handle->mvi, v, u);
320 
321     /* uu =  X^T.u_l (local portion) */
322     /* technically only need to zero out first i entries */
323     /* later turn this into an XXT_solve call ? */
324     PCTFS_rvec_zero(uu, m);
325     x_ptr = x;
326     iptr  = col_indices;
327     for (k = 0; k < i; k++) {
328       off = *iptr++;
329       len = *iptr++;
330       PetscCall(PetscBLASIntCast(len, &dlen));
331       PetscCallBLAS("BLASdot", uu[k] = BLASdot_(&dlen, u + off, &i1, x_ptr, &i1));
332       x_ptr += len;
333     }
334 
335     /* uu = X^T.u_l (comm portion) */
336     PetscCall(PCTFS_ssgl_radd(uu, w, dim, stages));
337 
338     /* z = X.uu */
339     PCTFS_rvec_zero(z, n);
340     x_ptr = x;
341     iptr  = col_indices;
342     for (k = 0; k < i; k++) {
343       off = *iptr++;
344       len = *iptr++;
345       PetscCall(PetscBLASIntCast(len, &dlen));
346       PetscCallBLAS("BLASaxpy", BLASaxpy_(&dlen, &uu[k], x_ptr, &i1, z + off, &i1));
347       x_ptr += len;
348     }
349 
350     /* compute v_l = v_l - z */
351     PCTFS_rvec_zero(v + a_n, a_m - a_n);
352     PetscCall(PetscBLASIntCast(n, &dlen));
353     PetscCallBLAS("BLASaxpy", BLASaxpy_(&dlen, &dm1, z, &i1, v, &i1));
354 
355     /* compute u_l = A.v_l */
356     if (a_n != a_m) PCTFS_gs_gop_hc(PCTFS_gs_handle, v, "+\0", dim);
357     PCTFS_rvec_zero(u, n);
358     do_matvec(xxt_handle->mvi, v, u);
359 
360     /* compute sqrt(alpha) = sqrt(v_l^T.u_l) - local portion */
361     PetscCall(PetscBLASIntCast(n, &dlen));
362     PetscCallBLAS("BLASdot", alpha = BLASdot_(&dlen, u, &i1, v, &i1));
363     /* compute sqrt(alpha) = sqrt(v_l^T.u_l) - comm portion */
364     PCTFS_grop_hc(&alpha, &alpha_w, 1, op, dim);
365 
366     alpha = (PetscScalar)PetscSqrtReal((PetscReal)alpha);
367 
368     /* check for small alpha                             */
369     /* LATER use this to detect and determine null space */
370     PetscCheck(PetscAbsScalar(alpha) >= 1.0e-14, PETSC_COMM_SELF, PETSC_ERR_PLIB, "bad alpha! %g", (double)PetscAbsScalar(alpha));
371 
372     /* compute v_l = v_l/sqrt(alpha) */
373     PCTFS_rvec_scale(v, 1.0 / alpha, n);
374 
375     /* add newly generated column, v_l, to X */
376     flag = 1;
377     off = len = 0;
378     for (k = 0; k < n; k++) {
379       if (v[k] != 0.0) {
380         len = k;
381         if (flag) {
382           off  = k;
383           flag = 0;
384         }
385       }
386     }
387 
388     len -= (off - 1);
389 
390     if (len > 0) {
391       if ((xxt_nnz + len) > xxt_max_nnz) {
392         PetscCall(PetscInfo(0, "increasing space for X by 2x!\n"));
393         xxt_max_nnz *= 2;
394         x_ptr = (PetscScalar *)malloc(xxt_max_nnz * sizeof(PetscScalar));
395         PCTFS_rvec_copy(x_ptr, x, xxt_nnz);
396         free(x);
397         x = x_ptr;
398         x_ptr += xxt_nnz;
399       }
400       xxt_nnz += len;
401       PCTFS_rvec_copy(x_ptr, v + off, len);
402 
403       col_indices[2 * i] = off;
404       col_sz[i] = col_indices[2 * i + 1] = len;
405       col_vals[i]                        = x_ptr;
406     } else {
407       col_indices[2 * i] = 0;
408       col_sz[i] = col_indices[2 * i + 1] = 0;
409       col_vals[i]                        = x_ptr;
410     }
411   }
412 
413   /* close off stages for execution phase */
414   while (dim != level) {
415     stages[dim++] = i;
416     PetscCall(PetscInfo(0, "disconnected!!! dim(%" PetscInt_FMT ")!=level(%" PetscInt_FMT ")\n", dim, level));
417   }
418   stages[dim] = i;
419 
420   xxt_handle->info->n              = xxt_handle->mvi->n;
421   xxt_handle->info->m              = m;
422   xxt_handle->info->nnz            = xxt_nnz;
423   xxt_handle->info->max_nnz        = xxt_max_nnz;
424   xxt_handle->info->msg_buf_sz     = stages[level] - stages[0];
425   xxt_handle->info->solve_uu       = (PetscScalar *)malloc(m * sizeof(PetscScalar));
426   xxt_handle->info->solve_w        = (PetscScalar *)malloc(m * sizeof(PetscScalar));
427   xxt_handle->info->x              = x;
428   xxt_handle->info->col_vals       = col_vals;
429   xxt_handle->info->col_sz         = col_sz;
430   xxt_handle->info->col_indices    = col_indices;
431   xxt_handle->info->stages         = stages;
432   xxt_handle->info->nsolves        = 0;
433   xxt_handle->info->tot_solve_time = 0.0;
434 
435   free(segs);
436   free(u);
437   free(v);
438   free(uu);
439   free(z);
440   free(w);
441 
442   return (0);
443 }
444 
445 static PetscErrorCode do_xxt_solve(xxt_ADT xxt_handle, PetscScalar *uc) {
446   PetscInt     off, len, *iptr;
447   PetscInt     level       = xxt_handle->level;
448   PetscInt     n           = xxt_handle->info->n;
449   PetscInt     m           = xxt_handle->info->m;
450   PetscInt    *stages      = xxt_handle->info->stages;
451   PetscInt    *col_indices = xxt_handle->info->col_indices;
452   PetscScalar *x_ptr, *uu_ptr;
453   PetscScalar *solve_uu = xxt_handle->info->solve_uu;
454   PetscScalar *solve_w  = xxt_handle->info->solve_w;
455   PetscScalar *x        = xxt_handle->info->x;
456   PetscBLASInt i1       = 1, dlen;
457 
458   PetscFunctionBegin;
459   uu_ptr = solve_uu;
460   PCTFS_rvec_zero(uu_ptr, m);
461 
462   /* x  = X.Y^T.b */
463   /* uu = Y^T.b */
464   for (x_ptr = x, iptr = col_indices; *iptr != -1; x_ptr += len) {
465     off = *iptr++;
466     len = *iptr++;
467     PetscCall(PetscBLASIntCast(len, &dlen));
468     PetscCallBLAS("BLASdot", *uu_ptr++ = BLASdot_(&dlen, uc + off, &i1, x_ptr, &i1));
469   }
470 
471   /* comunication of beta */
472   uu_ptr = solve_uu;
473   if (level) PetscCall(PCTFS_ssgl_radd(uu_ptr, solve_w, level, stages));
474 
475   PCTFS_rvec_zero(uc, n);
476 
477   /* x = X.uu */
478   for (x_ptr = x, iptr = col_indices; *iptr != -1; x_ptr += len) {
479     off = *iptr++;
480     len = *iptr++;
481     PetscCall(PetscBLASIntCast(len, &dlen));
482     PetscCallBLAS("BLASaxpy", BLASaxpy_(&dlen, uu_ptr++, x_ptr, &i1, uc + off, &i1));
483   }
484   PetscFunctionReturn(0);
485 }
486 
487 static PetscErrorCode check_handle(xxt_ADT xxt_handle) {
488   PetscInt vals[2], work[2], op[] = {NON_UNIFORM, GL_MIN, GL_MAX};
489 
490   PetscFunctionBegin;
491   PetscCheck(xxt_handle, PETSC_COMM_SELF, PETSC_ERR_PLIB, "check_handle() :: bad handle :: NULL %p", xxt_handle);
492 
493   vals[0] = vals[1] = xxt_handle->id;
494   PCTFS_giop(vals, work, PETSC_STATIC_ARRAY_LENGTH(op) - 1, op);
495   PetscCheck(!(vals[0] != vals[1]) && !(xxt_handle->id <= 0), PETSC_COMM_SELF, PETSC_ERR_PLIB, "check_handle() :: bad handle :: id mismatch min/max %" PetscInt_FMT "/%" PetscInt_FMT " %" PetscInt_FMT, vals[0], vals[1], xxt_handle->id);
496   PetscFunctionReturn(0);
497 }
498 
499 static PetscErrorCode det_separators(xxt_ADT xxt_handle) {
500   PetscInt     i, ct, id;
501   PetscInt     mask, edge, *iptr;
502   PetscInt    *dir, *used;
503   PetscInt     sum[4], w[4];
504   PetscScalar  rsum[4], rw[4];
505   PetscInt     op[] = {GL_ADD, 0};
506   PetscScalar *lhs, *rhs;
507   PetscInt    *nsep, *lnsep, *fo, nfo = 0;
508   PCTFS_gs_ADT PCTFS_gs_handle = xxt_handle->mvi->PCTFS_gs_handle;
509   PetscInt    *local2global    = xxt_handle->mvi->local2global;
510   PetscInt     n               = xxt_handle->mvi->n;
511   PetscInt     m               = xxt_handle->mvi->m;
512   PetscInt     level           = xxt_handle->level;
513   PetscInt     shared          = 0;
514 
515   PetscFunctionBegin;
516   dir   = (PetscInt *)malloc(sizeof(PetscInt) * (level + 1));
517   nsep  = (PetscInt *)malloc(sizeof(PetscInt) * (level + 1));
518   lnsep = (PetscInt *)malloc(sizeof(PetscInt) * (level + 1));
519   fo    = (PetscInt *)malloc(sizeof(PetscInt) * (n + 1));
520   used  = (PetscInt *)malloc(sizeof(PetscInt) * n);
521 
522   PCTFS_ivec_zero(dir, level + 1);
523   PCTFS_ivec_zero(nsep, level + 1);
524   PCTFS_ivec_zero(lnsep, level + 1);
525   PCTFS_ivec_set(fo, -1, n + 1);
526   PCTFS_ivec_zero(used, n);
527 
528   lhs = (PetscScalar *)malloc(sizeof(PetscScalar) * m);
529   rhs = (PetscScalar *)malloc(sizeof(PetscScalar) * m);
530 
531   /* determine the # of unique dof */
532   PCTFS_rvec_zero(lhs, m);
533   PCTFS_rvec_set(lhs, 1.0, n);
534   PCTFS_gs_gop_hc(PCTFS_gs_handle, lhs, "+\0", level);
535   PCTFS_rvec_zero(rsum, 2);
536   for (i = 0; i < n; i++) {
537     if (lhs[i] != 0.0) {
538       rsum[0] += 1.0 / lhs[i];
539       rsum[1] += lhs[i];
540     }
541   }
542   PCTFS_grop_hc(rsum, rw, 2, op, level);
543   rsum[0] += 0.1;
544   rsum[1] += 0.1;
545 
546   if (PetscAbsScalar(rsum[0] - rsum[1]) > EPS) shared = 1;
547 
548   xxt_handle->info->n_global = xxt_handle->info->m_global = (PetscInt)rsum[0];
549   xxt_handle->mvi->n_global = xxt_handle->mvi->m_global = (PetscInt)rsum[0];
550 
551   /* determine separator sets top down */
552   if (shared) {
553     for (iptr = fo + n, id = PCTFS_my_id, mask = PCTFS_num_nodes >> 1, edge = level; edge > 0; edge--, mask >>= 1) {
554       /* set rsh of hc, fire, and collect lhs responses */
555       (id < mask) ? PCTFS_rvec_zero(lhs, m) : PCTFS_rvec_set(lhs, 1.0, m);
556       PCTFS_gs_gop_hc(PCTFS_gs_handle, lhs, "+\0", edge);
557 
558       /* set lsh of hc, fire, and collect rhs responses */
559       (id < mask) ? PCTFS_rvec_set(rhs, 1.0, m) : PCTFS_rvec_zero(rhs, m);
560       PCTFS_gs_gop_hc(PCTFS_gs_handle, rhs, "+\0", edge);
561 
562       for (i = 0; i < n; i++) {
563         if (id < mask) {
564           if (lhs[i] != 0.0) lhs[i] = 1.0;
565         }
566         if (id >= mask) {
567           if (rhs[i] != 0.0) rhs[i] = 1.0;
568         }
569       }
570 
571       if (id < mask) PCTFS_gs_gop_hc(PCTFS_gs_handle, lhs, "+\0", edge - 1);
572       else PCTFS_gs_gop_hc(PCTFS_gs_handle, rhs, "+\0", edge - 1);
573 
574       /* count number of dofs I own that have signal and not in sep set */
575       PCTFS_rvec_zero(rsum, 4);
576       for (PCTFS_ivec_zero(sum, 4), ct = i = 0; i < n; i++) {
577         if (!used[i]) {
578           /* number of unmarked dofs on node */
579           ct++;
580           /* number of dofs to be marked on lhs hc */
581           if (id < mask) {
582             if (lhs[i] != 0.0) {
583               sum[0]++;
584               rsum[0] += 1.0 / lhs[i];
585             }
586           }
587           /* number of dofs to be marked on rhs hc */
588           if (id >= mask) {
589             if (rhs[i] != 0.0) {
590               sum[1]++;
591               rsum[1] += 1.0 / rhs[i];
592             }
593           }
594         }
595       }
596 
597       /* go for load balance - choose half with most unmarked dofs, bias LHS */
598       (id < mask) ? (sum[2] = ct) : (sum[3] = ct);
599       (id < mask) ? (rsum[2] = ct) : (rsum[3] = ct);
600       PCTFS_giop_hc(sum, w, 4, op, edge);
601       PCTFS_grop_hc(rsum, rw, 4, op, edge);
602       rsum[0] += 0.1;
603       rsum[1] += 0.1;
604       rsum[2] += 0.1;
605       rsum[3] += 0.1;
606 
607       if (id < mask) {
608         /* mark dofs I own that have signal and not in sep set */
609         for (ct = i = 0; i < n; i++) {
610           if ((!used[i]) && (lhs[i] != 0.0)) {
611             ct++;
612             nfo++;
613 
614             PetscCheck(nfo <= n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "nfo about to exceed n");
615 
616             *--iptr = local2global[i];
617             used[i] = edge;
618           }
619         }
620         if (ct > 1) PCTFS_ivec_sort(iptr, ct);
621 
622         lnsep[edge] = ct;
623         nsep[edge]  = (PetscInt)rsum[0];
624         dir[edge]   = LEFT;
625       }
626 
627       if (id >= mask) {
628         /* mark dofs I own that have signal and not in sep set */
629         for (ct = i = 0; i < n; i++) {
630           if ((!used[i]) && (rhs[i] != 0.0)) {
631             ct++;
632             nfo++;
633 
634             PetscCheck(nfo <= n, PETSC_COMM_SELF, PETSC_ERR_PLIB, "nfo about to exceed n");
635 
636             *--iptr = local2global[i];
637             used[i] = edge;
638           }
639         }
640         if (ct > 1) PCTFS_ivec_sort(iptr, ct);
641 
642         lnsep[edge] = ct;
643         nsep[edge]  = (PetscInt)rsum[1];
644         dir[edge]   = RIGHT;
645       }
646 
647       /* LATER or we can recur on these to order seps at this level */
648       /* do we need full set of separators for this?                */
649 
650       /* fold rhs hc into lower */
651       if (id >= mask) id -= mask;
652     }
653   } else {
654     for (iptr = fo + n, id = PCTFS_my_id, mask = PCTFS_num_nodes >> 1, edge = level; edge > 0; edge--, mask >>= 1) {
655       /* set rsh of hc, fire, and collect lhs responses */
656       (id < mask) ? PCTFS_rvec_zero(lhs, m) : PCTFS_rvec_set(lhs, 1.0, m);
657       PCTFS_gs_gop_hc(PCTFS_gs_handle, lhs, "+\0", edge);
658 
659       /* set lsh of hc, fire, and collect rhs responses */
660       (id < mask) ? PCTFS_rvec_set(rhs, 1.0, m) : PCTFS_rvec_zero(rhs, m);
661       PCTFS_gs_gop_hc(PCTFS_gs_handle, rhs, "+\0", edge);
662 
663       /* count number of dofs I own that have signal and not in sep set */
664       for (PCTFS_ivec_zero(sum, 4), ct = i = 0; i < n; i++) {
665         if (!used[i]) {
666           /* number of unmarked dofs on node */
667           ct++;
668           /* number of dofs to be marked on lhs hc */
669           if ((id < mask) && (lhs[i] != 0.0)) sum[0]++;
670           /* number of dofs to be marked on rhs hc */
671           if ((id >= mask) && (rhs[i] != 0.0)) sum[1]++;
672         }
673       }
674 
675       /* go for load balance - choose half with most unmarked dofs, bias LHS */
676       (id < mask) ? (sum[2] = ct) : (sum[3] = ct);
677       PCTFS_giop_hc(sum, w, 4, op, edge);
678 
679       /* lhs hc wins */
680       if (sum[2] >= sum[3]) {
681         if (id < mask) {
682           /* mark dofs I own that have signal and not in sep set */
683           for (ct = i = 0; i < n; i++) {
684             if ((!used[i]) && (lhs[i] != 0.0)) {
685               ct++;
686               nfo++;
687               *--iptr = local2global[i];
688               used[i] = edge;
689             }
690           }
691           if (ct > 1) PCTFS_ivec_sort(iptr, ct);
692           lnsep[edge] = ct;
693         }
694         nsep[edge] = sum[0];
695         dir[edge]  = LEFT;
696       } else { /* rhs hc wins */
697         if (id >= mask) {
698           /* mark dofs I own that have signal and not in sep set */
699           for (ct = i = 0; i < n; i++) {
700             if ((!used[i]) && (rhs[i] != 0.0)) {
701               ct++;
702               nfo++;
703               *--iptr = local2global[i];
704               used[i] = edge;
705             }
706           }
707           if (ct > 1) PCTFS_ivec_sort(iptr, ct);
708           lnsep[edge] = ct;
709         }
710         nsep[edge] = sum[1];
711         dir[edge]  = RIGHT;
712       }
713       /* LATER or we can recur on these to order seps at this level */
714       /* do we need full set of separators for this?                */
715 
716       /* fold rhs hc into lower */
717       if (id >= mask) id -= mask;
718     }
719   }
720 
721   /* level 0 is on processor case - so mark the remainder */
722   for (ct = i = 0; i < n; i++) {
723     if (!used[i]) {
724       ct++;
725       nfo++;
726       *--iptr = local2global[i];
727       used[i] = edge;
728     }
729   }
730   if (ct > 1) PCTFS_ivec_sort(iptr, ct);
731   lnsep[edge] = ct;
732   nsep[edge]  = ct;
733   dir[edge]   = LEFT;
734 
735   xxt_handle->info->nsep  = nsep;
736   xxt_handle->info->lnsep = lnsep;
737   xxt_handle->info->fo    = fo;
738   xxt_handle->info->nfo   = nfo;
739 
740   free(dir);
741   free(lhs);
742   free(rhs);
743   free(used);
744   PetscFunctionReturn(0);
745 }
746 
747 static mv_info *set_mvi(PetscInt *local2global, PetscInt n, PetscInt m, PetscErrorCode (*matvec)(mv_info *, PetscScalar *, PetscScalar *), void *grid_data) {
748   mv_info *mvi;
749 
750   mvi               = (mv_info *)malloc(sizeof(mv_info));
751   mvi->n            = n;
752   mvi->m            = m;
753   mvi->n_global     = -1;
754   mvi->m_global     = -1;
755   mvi->local2global = (PetscInt *)malloc((m + 1) * sizeof(PetscInt));
756   PCTFS_ivec_copy(mvi->local2global, local2global, m);
757   mvi->local2global[m] = INT_MAX;
758   mvi->matvec          = matvec;
759   mvi->grid_data       = grid_data;
760 
761   /* set xxt communication handle to perform restricted matvec */
762   mvi->PCTFS_gs_handle = PCTFS_gs_init(local2global, m, PCTFS_num_nodes);
763 
764   return (mvi);
765 }
766 
767 static PetscErrorCode do_matvec(mv_info *A, PetscScalar *v, PetscScalar *u) {
768   PetscFunctionBegin;
769   A->matvec((mv_info *)A->grid_data, v, u);
770   PetscFunctionReturn(0);
771 }
772