xref: /petsc/src/ksp/pc/impls/h2opus/pch2opus.c (revision 76be6f4ff3bd4e251c19fc00ebbebfd58b6e7589)
1 #include <petsc/private/pcimpl.h>
2 #include <petsc/private/matimpl.h>
3 #include <h2opusconf.h>
4 
5 /* Use GPU only if H2OPUS is configured for GPU */
6 #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU)
7 #define PETSC_H2OPUS_USE_GPU
8 #endif
9 
10 typedef struct {
11   Mat         A;
12   Mat         M;
13   PetscScalar s0;
14 
15   /* sampler for Newton-Schultz */
16   Mat      S;
17   PetscInt hyperorder;
18   Vec      wns[4];
19   Mat      wnsmat[4];
20 
21   /* convergence testing */
22   Mat       T;
23   Vec       w;
24 
25   /* Support for PCSetCoordinates */
26   PetscInt  sdim;
27   PetscInt  nlocc;
28   PetscReal *coords;
29 
30   /* Newton-Schultz customization */
31   PetscInt  maxits;
32   PetscReal rtol,atol;
33   PetscBool monitor;
34   PetscBool useapproximatenorms;
35   NormType  normtype;
36 
37   /* Used when pmat != MATH2OPUS */
38   PetscReal eta;
39   PetscInt  leafsize;
40   PetscInt  max_rank;
41   PetscInt  bs;
42   PetscReal mrtol;
43 
44   /* CPU/GPU */
45   PetscBool forcecpu;
46   PetscBool boundtocpu;
47 } PC_H2OPUS;
48 
49 PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat,NormType,PetscReal*);
50 
51 static PetscErrorCode PCReset_H2OPUS(PC pc)
52 {
53   PC_H2OPUS *pch2opus = (PC_H2OPUS*)pc->data;
54 
55   PetscFunctionBegin;
56   pch2opus->sdim  = 0;
57   pch2opus->nlocc = 0;
58   PetscCall(PetscFree(pch2opus->coords));
59   PetscCall(MatDestroy(&pch2opus->A));
60   PetscCall(MatDestroy(&pch2opus->M));
61   PetscCall(MatDestroy(&pch2opus->T));
62   PetscCall(VecDestroy(&pch2opus->w));
63   PetscCall(MatDestroy(&pch2opus->S));
64   PetscCall(VecDestroy(&pch2opus->wns[0]));
65   PetscCall(VecDestroy(&pch2opus->wns[1]));
66   PetscCall(VecDestroy(&pch2opus->wns[2]));
67   PetscCall(VecDestroy(&pch2opus->wns[3]));
68   PetscCall(MatDestroy(&pch2opus->wnsmat[0]));
69   PetscCall(MatDestroy(&pch2opus->wnsmat[1]));
70   PetscCall(MatDestroy(&pch2opus->wnsmat[2]));
71   PetscCall(MatDestroy(&pch2opus->wnsmat[3]));
72   PetscFunctionReturn(0);
73 }
74 
75 static PetscErrorCode PCSetCoordinates_H2OPUS(PC pc, PetscInt sdim, PetscInt nlocc, PetscReal *coords)
76 {
77   PC_H2OPUS *pch2opus = (PC_H2OPUS*)pc->data;
78   PetscBool  reset    = PETSC_TRUE;
79 
80   PetscFunctionBegin;
81   if (pch2opus->sdim && sdim == pch2opus->sdim && nlocc == pch2opus->nlocc) {
82     PetscCall(PetscArraycmp(pch2opus->coords,coords,sdim*nlocc,&reset));
83     reset = (PetscBool)!reset;
84   }
85   PetscCall(MPIU_Allreduce(MPI_IN_PLACE,&reset,1,MPIU_BOOL,MPI_LOR,PetscObjectComm((PetscObject)pc)));
86   if (reset) {
87     PetscCall(PCReset_H2OPUS(pc));
88     PetscCall(PetscMalloc1(sdim*nlocc,&pch2opus->coords));
89     PetscCall(PetscArraycpy(pch2opus->coords,coords,sdim*nlocc));
90     pch2opus->sdim  = sdim;
91     pch2opus->nlocc = nlocc;
92   }
93   PetscFunctionReturn(0);
94 }
95 
96 static PetscErrorCode PCDestroy_H2OPUS(PC pc)
97 {
98   PetscFunctionBegin;
99   PetscCall(PCReset_H2OPUS(pc));
100   PetscCall(PetscObjectComposeFunction((PetscObject)pc,"PCSetCoordinates_C",NULL));
101   PetscCall(PetscFree(pc->data));
102   PetscFunctionReturn(0);
103 }
104 
105 static PetscErrorCode PCSetFromOptions_H2OPUS(PetscOptionItems *PetscOptionsObject,PC pc)
106 {
107   PC_H2OPUS *pch2opus = (PC_H2OPUS*)pc->data;
108 
109   PetscFunctionBegin;
110   PetscOptionsHeadBegin(PetscOptionsObject,"H2OPUS options");
111   PetscCall(PetscOptionsInt("-pc_h2opus_maxits","Maximum number of iterations for Newton-Schultz",NULL,pch2opus->maxits,&pch2opus->maxits,NULL));
112   PetscCall(PetscOptionsBool("-pc_h2opus_monitor","Monitor Newton-Schultz convergence",NULL,pch2opus->monitor,&pch2opus->monitor,NULL));
113   PetscCall(PetscOptionsReal("-pc_h2opus_atol","Absolute tolerance",NULL,pch2opus->atol,&pch2opus->atol,NULL));
114   PetscCall(PetscOptionsReal("-pc_h2opus_rtol","Relative tolerance",NULL,pch2opus->rtol,&pch2opus->rtol,NULL));
115   PetscCall(PetscOptionsEnum("-pc_h2opus_norm_type","Norm type for convergence monitoring",NULL,NormTypes,(PetscEnum)pch2opus->normtype,(PetscEnum*)&pch2opus->normtype,NULL));
116   PetscCall(PetscOptionsInt("-pc_h2opus_hyperorder","Hyper power order of sampling",NULL,pch2opus->hyperorder,&pch2opus->hyperorder,NULL));
117   PetscCall(PetscOptionsInt("-pc_h2opus_leafsize","Leaf size when constructed from kernel",NULL,pch2opus->leafsize,&pch2opus->leafsize,NULL));
118   PetscCall(PetscOptionsReal("-pc_h2opus_eta","Admissibility condition tolerance",NULL,pch2opus->eta,&pch2opus->eta,NULL));
119   PetscCall(PetscOptionsInt("-pc_h2opus_maxrank","Maximum rank when constructed from matvecs",NULL,pch2opus->max_rank,&pch2opus->max_rank,NULL));
120   PetscCall(PetscOptionsInt("-pc_h2opus_samples","Number of samples to be taken concurrently when constructing from matvecs",NULL,pch2opus->bs,&pch2opus->bs,NULL));
121   PetscCall(PetscOptionsReal("-pc_h2opus_mrtol","Relative tolerance for construction from sampling",NULL,pch2opus->mrtol,&pch2opus->mrtol,NULL));
122   PetscCall(PetscOptionsBool("-pc_h2opus_forcecpu","Force construction on CPU",NULL,pch2opus->forcecpu,&pch2opus->forcecpu,NULL));
123   PetscOptionsHeadEnd();
124   PetscFunctionReturn(0);
125 }
126 
127 typedef struct {
128   Mat A;
129   Mat M;
130   Vec w;
131 } AAtCtx;
132 
133 static PetscErrorCode MatMult_AAt(Mat A, Vec x, Vec y)
134 {
135   AAtCtx *aat;
136 
137   PetscFunctionBegin;
138   PetscCall(MatShellGetContext(A,(void*)&aat));
139   /* PetscCall(MatMultTranspose(aat->M,x,aat->w)); */
140   PetscCall(MatMultTranspose(aat->A,x,aat->w));
141   PetscCall(MatMult(aat->A,aat->w,y));
142   PetscFunctionReturn(0);
143 }
144 
145 static PetscErrorCode PCH2OpusSetUpInit(PC pc)
146 {
147   PC_H2OPUS *pch2opus = (PC_H2OPUS*)pc->data;
148   Mat        A        = pc->useAmat ? pc->mat : pc->pmat, AAt;
149   PetscInt   M,m;
150   VecType    vtype;
151   PetscReal  n;
152   AAtCtx     aat;
153 
154   PetscFunctionBegin;
155   aat.A = A;
156   aat.M = pch2opus->M; /* unused so far */
157   PetscCall(MatCreateVecs(A,NULL,&aat.w));
158   PetscCall(MatGetSize(A,&M,NULL));
159   PetscCall(MatGetLocalSize(A,&m,NULL));
160   PetscCall(MatCreateShell(PetscObjectComm((PetscObject)A),m,m,M,M,&aat,&AAt));
161   PetscCall(MatBindToCPU(AAt,pch2opus->boundtocpu));
162   PetscCall(MatShellSetOperation(AAt,MATOP_MULT,(void (*)(void))MatMult_AAt));
163   PetscCall(MatShellSetOperation(AAt,MATOP_MULT_TRANSPOSE,(void (*)(void))MatMult_AAt));
164   PetscCall(MatShellSetOperation(AAt,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS));
165   PetscCall(MatGetVecType(A,&vtype));
166   PetscCall(MatShellSetVecType(AAt,vtype));
167   PetscCall(MatNorm(AAt,NORM_1,&n));
168   pch2opus->s0 = 1./n;
169   PetscCall(VecDestroy(&aat.w));
170   PetscCall(MatDestroy(&AAt));
171   PetscFunctionReturn(0);
172 }
173 
174 static PetscErrorCode PCApplyKernel_H2OPUS(PC pc, Vec x, Vec y, PetscBool t)
175 {
176   PC_H2OPUS *pch2opus = (PC_H2OPUS*)pc->data;
177 
178   PetscFunctionBegin;
179   if (t) PetscCall(MatMultTranspose(pch2opus->M,x,y));
180   else  PetscCall(MatMult(pch2opus->M,x,y));
181   PetscFunctionReturn(0);
182 }
183 
184 static PetscErrorCode PCApplyMatKernel_H2OPUS(PC pc, Mat X, Mat Y, PetscBool t)
185 {
186   PC_H2OPUS *pch2opus = (PC_H2OPUS*)pc->data;
187 
188   PetscFunctionBegin;
189   if (t) PetscCall(MatTransposeMatMult(pch2opus->M,X,MAT_REUSE_MATRIX,PETSC_DEFAULT,&Y));
190   else   PetscCall(MatMatMult(pch2opus->M,X,MAT_REUSE_MATRIX,PETSC_DEFAULT,&Y));
191   PetscFunctionReturn(0);
192 }
193 
194 static PetscErrorCode PCApplyMat_H2OPUS(PC pc, Mat X, Mat Y)
195 {
196   PetscFunctionBegin;
197   PetscCall(PCApplyMatKernel_H2OPUS(pc,X,Y,PETSC_FALSE));
198   PetscFunctionReturn(0);
199 }
200 
201 static PetscErrorCode PCApplyTransposeMat_H2OPUS(PC pc, Mat X, Mat Y)
202 {
203   PetscFunctionBegin;
204   PetscCall(PCApplyMatKernel_H2OPUS(pc,X,Y,PETSC_TRUE));
205   PetscFunctionReturn(0);
206 }
207 
208 static PetscErrorCode PCApply_H2OPUS(PC pc, Vec x, Vec y)
209 {
210   PetscFunctionBegin;
211   PetscCall(PCApplyKernel_H2OPUS(pc,x,y,PETSC_FALSE));
212   PetscFunctionReturn(0);
213 }
214 
215 static PetscErrorCode PCApplyTranspose_H2OPUS(PC pc, Vec x, Vec y)
216 {
217   PetscFunctionBegin;
218   PetscCall(PCApplyKernel_H2OPUS(pc,x,y,PETSC_TRUE));
219   PetscFunctionReturn(0);
220 }
221 
222 /* used to test the norm of (M^-1 A - I) */
223 static PetscErrorCode MatMultKernel_MAmI(Mat M, Vec x, Vec y, PetscBool t)
224 {
225   PC         pc;
226   Mat        A;
227   PC_H2OPUS *pch2opus;
228   PetscBool  sideleft = PETSC_TRUE;
229 
230   PetscFunctionBegin;
231   PetscCall(MatShellGetContext(M,(void*)&pc));
232   pch2opus = (PC_H2OPUS*)pc->data;
233   if (!pch2opus->w) PetscCall(MatCreateVecs(pch2opus->M,&pch2opus->w,NULL));
234   A = pch2opus->A;
235   PetscCall(VecBindToCPU(pch2opus->w,pch2opus->boundtocpu));
236   if (t) {
237     if (sideleft) {
238       PetscCall(PCApplyTranspose_H2OPUS(pc,x,pch2opus->w));
239       PetscCall(MatMultTranspose(A,pch2opus->w,y));
240     } else {
241       PetscCall(MatMultTranspose(A,x,pch2opus->w));
242       PetscCall(PCApplyTranspose_H2OPUS(pc,pch2opus->w,y));
243     }
244   } else {
245     if (sideleft) {
246       PetscCall(MatMult(A,x,pch2opus->w));
247       PetscCall(PCApply_H2OPUS(pc,pch2opus->w,y));
248     } else {
249       PetscCall(PCApply_H2OPUS(pc,x,pch2opus->w));
250       PetscCall(MatMult(A,pch2opus->w,y));
251     }
252   }
253   PetscCall(VecAXPY(y,-1.0,x));
254   PetscFunctionReturn(0);
255 }
256 
257 static PetscErrorCode MatMult_MAmI(Mat A, Vec x, Vec y)
258 {
259   PetscFunctionBegin;
260   PetscCall(MatMultKernel_MAmI(A,x,y,PETSC_FALSE));
261   PetscFunctionReturn(0);
262 }
263 
264 static PetscErrorCode MatMultTranspose_MAmI(Mat A, Vec x, Vec y)
265 {
266   PetscFunctionBegin;
267   PetscCall(MatMultKernel_MAmI(A,x,y,PETSC_TRUE));
268   PetscFunctionReturn(0);
269 }
270 
271 /* HyperPower kernel:
272 Y = R = x
273 for i = 1 . . . l - 1 do
274   R = (I - A * Xk) * R
275   Y = Y + R
276 Y = Xk * Y
277 */
278 static PetscErrorCode MatMultKernel_Hyper(Mat M, Vec x, Vec y, PetscBool t)
279 {
280   PC         pc;
281   Mat        A;
282   PC_H2OPUS *pch2opus;
283 
284   PetscFunctionBegin;
285   PetscCall(MatShellGetContext(M,(void*)&pc));
286   pch2opus = (PC_H2OPUS*)pc->data;
287   A = pch2opus->A;
288   PetscCall(MatCreateVecs(pch2opus->M,pch2opus->wns[0] ? NULL : &pch2opus->wns[0],pch2opus->wns[1] ? NULL : &pch2opus->wns[1]));
289   PetscCall(MatCreateVecs(pch2opus->M,pch2opus->wns[2] ? NULL : &pch2opus->wns[2],pch2opus->wns[3] ? NULL : &pch2opus->wns[3]));
290   PetscCall(VecBindToCPU(pch2opus->wns[0],pch2opus->boundtocpu));
291   PetscCall(VecBindToCPU(pch2opus->wns[1],pch2opus->boundtocpu));
292   PetscCall(VecBindToCPU(pch2opus->wns[2],pch2opus->boundtocpu));
293   PetscCall(VecBindToCPU(pch2opus->wns[3],pch2opus->boundtocpu));
294   PetscCall(VecCopy(x,pch2opus->wns[0]));
295   PetscCall(VecCopy(x,pch2opus->wns[3]));
296   if (t) {
297     for (PetscInt i=0;i<pch2opus->hyperorder-1;i++) {
298       PetscCall(MatMultTranspose(A,pch2opus->wns[0],pch2opus->wns[1]));
299       PetscCall(PCApplyTranspose_H2OPUS(pc,pch2opus->wns[1],pch2opus->wns[2]));
300       PetscCall(VecAXPY(pch2opus->wns[0],-1.,pch2opus->wns[2]));
301       PetscCall(VecAXPY(pch2opus->wns[3], 1.,pch2opus->wns[0]));
302     }
303     PetscCall(PCApplyTranspose_H2OPUS(pc,pch2opus->wns[3],y));
304   } else {
305     for (PetscInt i=0;i<pch2opus->hyperorder-1;i++) {
306       PetscCall(PCApply_H2OPUS(pc,pch2opus->wns[0],pch2opus->wns[1]));
307       PetscCall(MatMult(A,pch2opus->wns[1],pch2opus->wns[2]));
308       PetscCall(VecAXPY(pch2opus->wns[0],-1.,pch2opus->wns[2]));
309       PetscCall(VecAXPY(pch2opus->wns[3], 1.,pch2opus->wns[0]));
310     }
311     PetscCall(PCApply_H2OPUS(pc,pch2opus->wns[3],y));
312   }
313   PetscFunctionReturn(0);
314 }
315 
316 static PetscErrorCode MatMult_Hyper(Mat M, Vec x, Vec y)
317 {
318   PetscFunctionBegin;
319   PetscCall(MatMultKernel_Hyper(M,x,y,PETSC_FALSE));
320   PetscFunctionReturn(0);
321 }
322 
323 static PetscErrorCode MatMultTranspose_Hyper(Mat M, Vec x, Vec y)
324 {
325   PetscFunctionBegin;
326   PetscCall(MatMultKernel_Hyper(M,x,y,PETSC_TRUE));
327   PetscFunctionReturn(0);
328 }
329 
330 /* Hyper power kernel, MatMat version */
331 static PetscErrorCode MatMatMultKernel_Hyper(Mat M, Mat X, Mat Y, PetscBool t)
332 {
333   PC         pc;
334   Mat        A;
335   PC_H2OPUS *pch2opus;
336   PetscInt   i;
337 
338   PetscFunctionBegin;
339   PetscCall(MatShellGetContext(M,(void*)&pc));
340   pch2opus = (PC_H2OPUS*)pc->data;
341   A = pch2opus->A;
342   if (pch2opus->wnsmat[0] && pch2opus->wnsmat[0]->cmap->N != X->cmap->N) {
343     PetscCall(MatDestroy(&pch2opus->wnsmat[0]));
344     PetscCall(MatDestroy(&pch2opus->wnsmat[1]));
345   }
346   if (!pch2opus->wnsmat[0]) {
347     PetscCall(MatDuplicate(X,MAT_SHARE_NONZERO_PATTERN,&pch2opus->wnsmat[0]));
348     PetscCall(MatDuplicate(Y,MAT_SHARE_NONZERO_PATTERN,&pch2opus->wnsmat[1]));
349   }
350   if (pch2opus->wnsmat[2] && pch2opus->wnsmat[2]->cmap->N != X->cmap->N) {
351     PetscCall(MatDestroy(&pch2opus->wnsmat[2]));
352     PetscCall(MatDestroy(&pch2opus->wnsmat[3]));
353   }
354   if (!pch2opus->wnsmat[2]) {
355     PetscCall(MatDuplicate(X,MAT_SHARE_NONZERO_PATTERN,&pch2opus->wnsmat[2]));
356     PetscCall(MatDuplicate(Y,MAT_SHARE_NONZERO_PATTERN,&pch2opus->wnsmat[3]));
357   }
358   PetscCall(MatCopy(X,pch2opus->wnsmat[0],SAME_NONZERO_PATTERN));
359   PetscCall(MatCopy(X,pch2opus->wnsmat[3],SAME_NONZERO_PATTERN));
360   if (t) {
361     for (i=0;i<pch2opus->hyperorder-1;i++) {
362       PetscCall(MatTransposeMatMult(A,pch2opus->wnsmat[0],MAT_REUSE_MATRIX,PETSC_DEFAULT,&pch2opus->wnsmat[1]));
363       PetscCall(PCApplyTransposeMat_H2OPUS(pc,pch2opus->wnsmat[1],pch2opus->wnsmat[2]));
364       PetscCall(MatAXPY(pch2opus->wnsmat[0],-1.,pch2opus->wnsmat[2],SAME_NONZERO_PATTERN));
365       PetscCall(MatAXPY(pch2opus->wnsmat[3],1.,pch2opus->wnsmat[0],SAME_NONZERO_PATTERN));
366     }
367     PetscCall(PCApplyTransposeMat_H2OPUS(pc,pch2opus->wnsmat[3],Y));
368   } else {
369     for (i=0;i<pch2opus->hyperorder-1;i++) {
370       PetscCall(PCApplyMat_H2OPUS(pc,pch2opus->wnsmat[0],pch2opus->wnsmat[1]));
371       PetscCall(MatMatMult(A,pch2opus->wnsmat[1],MAT_REUSE_MATRIX,PETSC_DEFAULT,&pch2opus->wnsmat[2]));
372       PetscCall(MatAXPY(pch2opus->wnsmat[0],-1.,pch2opus->wnsmat[2],SAME_NONZERO_PATTERN));
373       PetscCall(MatAXPY(pch2opus->wnsmat[3],1.,pch2opus->wnsmat[0],SAME_NONZERO_PATTERN));
374     }
375     PetscCall(PCApplyMat_H2OPUS(pc,pch2opus->wnsmat[3],Y));
376   }
377   PetscFunctionReturn(0);
378 }
379 
380 static PetscErrorCode MatMatMultNumeric_Hyper(Mat M, Mat X, Mat Y,void *ctx)
381 {
382   PetscFunctionBegin;
383   PetscCall(MatMatMultKernel_Hyper(M,X,Y,PETSC_FALSE));
384   PetscFunctionReturn(0);
385 }
386 
387 /* Basic Newton-Schultz sampler: (2 * I - M * A)*M */
388 static PetscErrorCode MatMultKernel_NS(Mat M, Vec x, Vec y, PetscBool t)
389 {
390   PC         pc;
391   Mat        A;
392   PC_H2OPUS *pch2opus;
393 
394   PetscFunctionBegin;
395   PetscCall(MatShellGetContext(M,(void*)&pc));
396   pch2opus = (PC_H2OPUS*)pc->data;
397   A = pch2opus->A;
398   PetscCall(MatCreateVecs(pch2opus->M,pch2opus->wns[0] ? NULL : &pch2opus->wns[0],pch2opus->wns[1] ? NULL : &pch2opus->wns[1]));
399   PetscCall(VecBindToCPU(pch2opus->wns[0],pch2opus->boundtocpu));
400   PetscCall(VecBindToCPU(pch2opus->wns[1],pch2opus->boundtocpu));
401   if (t) {
402     PetscCall(PCApplyTranspose_H2OPUS(pc,x,y));
403     PetscCall(MatMultTranspose(A,y,pch2opus->wns[1]));
404     PetscCall(PCApplyTranspose_H2OPUS(pc,pch2opus->wns[1],pch2opus->wns[0]));
405     PetscCall(VecAXPBY(y,-1.,2.,pch2opus->wns[0]));
406   } else {
407     PetscCall(PCApply_H2OPUS(pc,x,y));
408     PetscCall(MatMult(A,y,pch2opus->wns[0]));
409     PetscCall(PCApply_H2OPUS(pc,pch2opus->wns[0],pch2opus->wns[1]));
410     PetscCall(VecAXPBY(y,-1.,2.,pch2opus->wns[1]));
411   }
412   PetscFunctionReturn(0);
413 }
414 
415 static PetscErrorCode MatMult_NS(Mat M, Vec x, Vec y)
416 {
417   PetscFunctionBegin;
418   PetscCall(MatMultKernel_NS(M,x,y,PETSC_FALSE));
419   PetscFunctionReturn(0);
420 }
421 
422 static PetscErrorCode MatMultTranspose_NS(Mat M, Vec x, Vec y)
423 {
424   PetscFunctionBegin;
425   PetscCall(MatMultKernel_NS(M,x,y,PETSC_TRUE));
426   PetscFunctionReturn(0);
427 }
428 
429 /* Basic Newton-Schultz sampler: (2 * I - M * A)*M, MatMat version */
430 static PetscErrorCode MatMatMultKernel_NS(Mat M, Mat X, Mat Y, PetscBool t)
431 {
432   PC         pc;
433   Mat        A;
434   PC_H2OPUS *pch2opus;
435 
436   PetscFunctionBegin;
437   PetscCall(MatShellGetContext(M,(void*)&pc));
438   pch2opus = (PC_H2OPUS*)pc->data;
439   A = pch2opus->A;
440   if (pch2opus->wnsmat[0] && pch2opus->wnsmat[0]->cmap->N != X->cmap->N) {
441     PetscCall(MatDestroy(&pch2opus->wnsmat[0]));
442     PetscCall(MatDestroy(&pch2opus->wnsmat[1]));
443   }
444   if (!pch2opus->wnsmat[0]) {
445     PetscCall(MatDuplicate(X,MAT_SHARE_NONZERO_PATTERN,&pch2opus->wnsmat[0]));
446     PetscCall(MatDuplicate(Y,MAT_SHARE_NONZERO_PATTERN,&pch2opus->wnsmat[1]));
447   }
448   if (t) {
449     PetscCall(PCApplyTransposeMat_H2OPUS(pc,X,Y));
450     PetscCall(MatTransposeMatMult(A,Y,MAT_REUSE_MATRIX,PETSC_DEFAULT,&pch2opus->wnsmat[1]));
451     PetscCall(PCApplyTransposeMat_H2OPUS(pc,pch2opus->wnsmat[1],pch2opus->wnsmat[0]));
452     PetscCall(MatScale(Y,2.));
453     PetscCall(MatAXPY(Y,-1.,pch2opus->wnsmat[0],SAME_NONZERO_PATTERN));
454   } else {
455     PetscCall(PCApplyMat_H2OPUS(pc,X,Y));
456     PetscCall(MatMatMult(A,Y,MAT_REUSE_MATRIX,PETSC_DEFAULT,&pch2opus->wnsmat[0]));
457     PetscCall(PCApplyMat_H2OPUS(pc,pch2opus->wnsmat[0],pch2opus->wnsmat[1]));
458     PetscCall(MatScale(Y,2.));
459     PetscCall(MatAXPY(Y,-1.,pch2opus->wnsmat[1],SAME_NONZERO_PATTERN));
460   }
461   PetscFunctionReturn(0);
462 }
463 
464 static PetscErrorCode MatMatMultNumeric_NS(Mat M, Mat X, Mat Y, void *ctx)
465 {
466   PetscFunctionBegin;
467   PetscCall(MatMatMultKernel_NS(M,X,Y,PETSC_FALSE));
468   PetscFunctionReturn(0);
469 }
470 
471 static PetscErrorCode PCH2OpusSetUpSampler_Private(PC pc)
472 {
473   PC_H2OPUS *pch2opus = (PC_H2OPUS*)pc->data;
474   Mat        A        = pc->useAmat ? pc->mat : pc->pmat;
475 
476   PetscFunctionBegin;
477   if (!pch2opus->S) {
478     PetscInt M,N,m,n;
479 
480     PetscCall(MatGetSize(A,&M,&N));
481     PetscCall(MatGetLocalSize(A,&m,&n));
482     PetscCall(MatCreateShell(PetscObjectComm((PetscObject)A),m,n,M,N,pc,&pch2opus->S));
483     PetscCall(MatSetBlockSizesFromMats(pch2opus->S,A,A));
484 #if defined(PETSC_H2OPUS_USE_GPU)
485     PetscCall(MatShellSetVecType(pch2opus->S,VECCUDA));
486 #endif
487   }
488   if (pch2opus->hyperorder >= 2) {
489     PetscCall(MatShellSetOperation(pch2opus->S,MATOP_MULT,(void (*)(void))MatMult_Hyper));
490     PetscCall(MatShellSetOperation(pch2opus->S,MATOP_MULT_TRANSPOSE,(void (*)(void))MatMultTranspose_Hyper));
491     PetscCall(MatShellSetMatProductOperation(pch2opus->S,MATPRODUCT_AB,NULL,MatMatMultNumeric_Hyper,NULL,MATDENSE,MATDENSE));
492     PetscCall(MatShellSetMatProductOperation(pch2opus->S,MATPRODUCT_AB,NULL,MatMatMultNumeric_Hyper,NULL,MATDENSECUDA,MATDENSECUDA));
493   } else {
494     PetscCall(MatShellSetOperation(pch2opus->S,MATOP_MULT,(void (*)(void))MatMult_NS));
495     PetscCall(MatShellSetOperation(pch2opus->S,MATOP_MULT_TRANSPOSE,(void (*)(void))MatMultTranspose_NS));
496     PetscCall(MatShellSetMatProductOperation(pch2opus->S,MATPRODUCT_AB,NULL,MatMatMultNumeric_NS,NULL,MATDENSE,MATDENSE));
497     PetscCall(MatShellSetMatProductOperation(pch2opus->S,MATPRODUCT_AB,NULL,MatMatMultNumeric_NS,NULL,MATDENSECUDA,MATDENSECUDA));
498   }
499   PetscCall(MatPropagateSymmetryOptions(A,pch2opus->S));
500   PetscCall(MatBindToCPU(pch2opus->S,pch2opus->boundtocpu));
501   /* XXX */
502   PetscCall(MatSetOption(pch2opus->S,MAT_SYMMETRIC,PETSC_TRUE));
503   PetscFunctionReturn(0);
504 }
505 
506 static PetscErrorCode PCSetUp_H2OPUS(PC pc)
507 {
508   PC_H2OPUS *pch2opus  = (PC_H2OPUS*)pc->data;
509   Mat        A         = pc->useAmat ? pc->mat : pc->pmat;
510   NormType   norm      = pch2opus->normtype;
511   PetscReal  initerr   = 0.0,err;
512   PetscBool  ish2opus;
513 
514   PetscFunctionBegin;
515   if (!pch2opus->T) {
516     PetscInt    M,N,m,n;
517     const char *prefix;
518 
519     PetscCall(PCGetOptionsPrefix(pc,&prefix));
520     PetscCall(MatGetSize(A,&M,&N));
521     PetscCall(MatGetLocalSize(A,&m,&n));
522     PetscCall(MatCreateShell(PetscObjectComm((PetscObject)pc->pmat),m,n,M,N,pc,&pch2opus->T));
523     PetscCall(MatSetBlockSizesFromMats(pch2opus->T,A,A));
524     PetscCall(MatShellSetOperation(pch2opus->T,MATOP_MULT,(void (*)(void))MatMult_MAmI));
525     PetscCall(MatShellSetOperation(pch2opus->T,MATOP_MULT_TRANSPOSE,(void (*)(void))MatMultTranspose_MAmI));
526     PetscCall(MatShellSetOperation(pch2opus->T,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS));
527 #if defined(PETSC_H2OPUS_USE_GPU)
528     PetscCall(MatShellSetVecType(pch2opus->T,VECCUDA));
529 #endif
530     PetscCall(PetscLogObjectParent((PetscObject)pc,(PetscObject)pch2opus->T));
531     PetscCall(MatSetOptionsPrefix(pch2opus->T,prefix));
532     PetscCall(MatAppendOptionsPrefix(pch2opus->T,"pc_h2opus_est_"));
533   }
534   PetscCall(MatDestroy(&pch2opus->A));
535   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
536   if (ish2opus) {
537     PetscCall(PetscObjectReference((PetscObject)A));
538     pch2opus->A = A;
539   } else {
540     const char *prefix;
541     PetscCall(MatCreateH2OpusFromMat(A,pch2opus->sdim,pch2opus->coords,PETSC_FALSE,pch2opus->eta,pch2opus->leafsize,pch2opus->max_rank,pch2opus->bs,pch2opus->mrtol,&pch2opus->A));
542     /* XXX */
543     PetscCall(MatSetOption(pch2opus->A,MAT_SYMMETRIC,PETSC_TRUE));
544     PetscCall(PCGetOptionsPrefix(pc,&prefix));
545     PetscCall(MatSetOptionsPrefix(pch2opus->A,prefix));
546     PetscCall(MatAppendOptionsPrefix(pch2opus->A,"pc_h2opus_init_"));
547     PetscCall(MatSetFromOptions(pch2opus->A));
548     PetscCall(MatAssemblyBegin(pch2opus->A,MAT_FINAL_ASSEMBLY));
549     PetscCall(MatAssemblyEnd(pch2opus->A,MAT_FINAL_ASSEMBLY));
550     /* XXX */
551     PetscCall(MatSetOption(pch2opus->A,MAT_SYMMETRIC,PETSC_TRUE));
552 
553     /* always perform construction on the GPU unless forcecpu is true */
554     PetscCall(MatBindToCPU(pch2opus->A,pch2opus->forcecpu));
555   }
556 #if defined(PETSC_H2OPUS_USE_GPU)
557   pch2opus->boundtocpu = pch2opus->forcecpu ? PETSC_TRUE : pch2opus->A->boundtocpu;
558 #endif
559   PetscCall(MatBindToCPU(pch2opus->T,pch2opus->boundtocpu));
560   if (pch2opus->M) { /* see if we can reuse M as initial guess */
561     PetscReal reuse;
562 
563     PetscCall(MatBindToCPU(pch2opus->M,pch2opus->boundtocpu));
564     PetscCall(MatNorm(pch2opus->T,norm,&reuse));
565     if (reuse >= 1.0) PetscCall(MatDestroy(&pch2opus->M));
566   }
567   if (!pch2opus->M) {
568     const char *prefix;
569     PetscCall(MatDuplicate(pch2opus->A,MAT_COPY_VALUES,&pch2opus->M));
570     PetscCall(PCGetOptionsPrefix(pc,&prefix));
571     PetscCall(MatSetOptionsPrefix(pch2opus->M,prefix));
572     PetscCall(MatAppendOptionsPrefix(pch2opus->M,"pc_h2opus_inv_"));
573     PetscCall(MatSetFromOptions(pch2opus->M));
574     PetscCall(PCH2OpusSetUpInit(pc));
575     PetscCall(MatScale(pch2opus->M,pch2opus->s0));
576   }
577   /* A and M have the same h2 matrix structure, save on reordering routines */
578   PetscCall(MatH2OpusSetNativeMult(pch2opus->A,PETSC_TRUE));
579   PetscCall(MatH2OpusSetNativeMult(pch2opus->M,PETSC_TRUE));
580   if (norm == NORM_1 || norm == NORM_2 || norm == NORM_INFINITY) PetscCall(MatNorm(pch2opus->T,norm,&initerr));
581   if (PetscIsInfOrNanReal(initerr)) pc->failedreason = PC_SETUP_ERROR;
582   err   = initerr;
583   if (pch2opus->monitor) {
584     PetscCall(PetscPrintf(PetscObjectComm((PetscObject)pc),"%" PetscInt_FMT ": ||M*A - I|| NORM%s abs %g rel %g\n",0,NormTypes[norm],(double)err,(double)(err/initerr)));
585   }
586   if (initerr > pch2opus->atol && !pc->failedreason) {
587     PetscInt i;
588 
589     PetscCall(PCH2OpusSetUpSampler_Private(pc));
590     for (i = 0; i < pch2opus->maxits; i++) {
591       Mat         M;
592       const char* prefix;
593 
594       PetscCall(MatDuplicate(pch2opus->M,MAT_SHARE_NONZERO_PATTERN,&M));
595       PetscCall(MatGetOptionsPrefix(pch2opus->M,&prefix));
596       PetscCall(MatSetOptionsPrefix(M,prefix));
597       PetscCall(MatH2OpusSetSamplingMat(M,pch2opus->S,PETSC_DECIDE,PETSC_DECIDE));
598       PetscCall(MatSetFromOptions(M));
599       PetscCall(MatH2OpusSetNativeMult(M,PETSC_TRUE));
600       PetscCall(MatAssemblyBegin(M,MAT_FINAL_ASSEMBLY));
601       PetscCall(MatAssemblyEnd(M,MAT_FINAL_ASSEMBLY));
602       PetscCall(MatDestroy(&pch2opus->M));
603       pch2opus->M = M;
604       if (norm == NORM_1 || norm == NORM_2 || norm == NORM_INFINITY) PetscCall(MatNorm(pch2opus->T,norm,&err));
605       if (pch2opus->monitor) {
606         PetscCall(PetscPrintf(PetscObjectComm((PetscObject)pc),"%" PetscInt_FMT ": ||M*A - I|| NORM%s abs %g rel %g\n",i+1,NormTypes[norm],(double)err,(double)(err/initerr)));
607       }
608       if (PetscIsInfOrNanReal(err)) pc->failedreason = PC_SETUP_ERROR;
609       if (err < pch2opus->atol || err < pch2opus->rtol*initerr || pc->failedreason) break;
610     }
611   }
612   /* cleanup setup workspace */
613   PetscCall(MatH2OpusSetNativeMult(pch2opus->A,PETSC_FALSE));
614   PetscCall(MatH2OpusSetNativeMult(pch2opus->M,PETSC_FALSE));
615   PetscCall(VecDestroy(&pch2opus->wns[0]));
616   PetscCall(VecDestroy(&pch2opus->wns[1]));
617   PetscCall(VecDestroy(&pch2opus->wns[2]));
618   PetscCall(VecDestroy(&pch2opus->wns[3]));
619   PetscCall(MatDestroy(&pch2opus->wnsmat[0]));
620   PetscCall(MatDestroy(&pch2opus->wnsmat[1]));
621   PetscCall(MatDestroy(&pch2opus->wnsmat[2]));
622   PetscCall(MatDestroy(&pch2opus->wnsmat[3]));
623   PetscFunctionReturn(0);
624 }
625 
626 static PetscErrorCode PCView_H2OPUS(PC pc, PetscViewer viewer)
627 {
628   PC_H2OPUS *pch2opus = (PC_H2OPUS*)pc->data;
629   PetscBool  isascii;
630 
631   PetscFunctionBegin;
632   PetscCall(PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&isascii));
633   if (isascii) {
634     if (pch2opus->A && pch2opus->A != pc->mat && pch2opus->A != pc->pmat) {
635       PetscCall(PetscViewerASCIIPrintf(viewer,"Initial approximation matrix\n"));
636       PetscCall(PetscViewerASCIIPushTab(viewer));
637       PetscCall(PetscViewerPushFormat(viewer,PETSC_VIEWER_ASCII_INFO_DETAIL));
638       PetscCall(MatView(pch2opus->A,viewer));
639       PetscCall(PetscViewerPopFormat(viewer));
640       PetscCall(PetscViewerASCIIPopTab(viewer));
641     }
642     if (pch2opus->M) {
643       PetscCall(PetscViewerASCIIPrintf(viewer,"Inner matrix constructed\n"));
644       PetscCall(PetscViewerASCIIPushTab(viewer));
645       PetscCall(PetscViewerPushFormat(viewer,PETSC_VIEWER_ASCII_INFO_DETAIL));
646       PetscCall(MatView(pch2opus->M,viewer));
647       PetscCall(PetscViewerPopFormat(viewer));
648       PetscCall(PetscViewerASCIIPopTab(viewer));
649     }
650     PetscCall(PetscViewerASCIIPrintf(viewer,"Initial scaling: %g\n",pch2opus->s0));
651   }
652   PetscFunctionReturn(0);
653 }
654 
655 PETSC_EXTERN PetscErrorCode PCCreate_H2OPUS(PC pc)
656 {
657   PC_H2OPUS *pch2opus;
658 
659   PetscFunctionBegin;
660   PetscCall(PetscNewLog(pc,&pch2opus));
661   pc->data = (void*)pch2opus;
662 
663   pch2opus->atol       = 1.e-2;
664   pch2opus->rtol       = 1.e-6;
665   pch2opus->maxits     = 50;
666   pch2opus->hyperorder = 1; /* defaults to basic NewtonSchultz */
667   pch2opus->normtype   = NORM_2;
668 
669   /* these are needed when we are sampling the pmat */
670   pch2opus->eta        = PETSC_DECIDE;
671   pch2opus->leafsize   = PETSC_DECIDE;
672   pch2opus->max_rank   = PETSC_DECIDE;
673   pch2opus->bs         = PETSC_DECIDE;
674   pch2opus->mrtol      = PETSC_DECIDE;
675 #if defined(PETSC_H2OPUS_USE_GPU)
676   pch2opus->boundtocpu = PETSC_FALSE;
677 #else
678   pch2opus->boundtocpu = PETSC_TRUE;
679 #endif
680   pc->ops->destroy        = PCDestroy_H2OPUS;
681   pc->ops->setup          = PCSetUp_H2OPUS;
682   pc->ops->apply          = PCApply_H2OPUS;
683   pc->ops->matapply       = PCApplyMat_H2OPUS;
684   pc->ops->applytranspose = PCApplyTranspose_H2OPUS;
685   pc->ops->reset          = PCReset_H2OPUS;
686   pc->ops->setfromoptions = PCSetFromOptions_H2OPUS;
687   pc->ops->view           = PCView_H2OPUS;
688 
689   PetscCall(PetscObjectComposeFunction((PetscObject)pc,"PCSetCoordinates_C",PCSetCoordinates_H2OPUS));
690   PetscFunctionReturn(0);
691 }
692