xref: /petsc/src/ksp/pc/impls/h2opus/pch2opus.c (revision ccb4e88a40f0b86eaeca07ff64c64e4de2fae686)
1 #include <petsc/private/pcimpl.h>
2 #include <petsc/private/matimpl.h>
3 #include <h2opusconf.h>
4 
5 /* Use GPU only if H2OPUS is configured for GPU */
6 #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU)
7 #define PETSC_H2OPUS_USE_GPU
8 #endif
9 
10 typedef struct {
11   Mat         A;
12   Mat         M;
13   PetscScalar s0;
14 
15   /* sampler for Newton-Schultz */
16   Mat      S;
17   PetscInt hyperorder;
18   Vec      wns[4];
19   Mat      wnsmat[4];
20 
21   /* convergence testing */
22   Mat       T;
23   Vec       w;
24   PetscBool testMA;
25 
26   /* Support for PCSetCoordinates */
27   PetscInt  sdim;
28   PetscInt  nlocc;
29   PetscReal *coords;
30 
31   /* Newton-Schultz customization */
32   PetscInt  maxits;
33   PetscReal rtol,atol;
34   PetscBool monitor;
35   PetscBool useapproximatenorms;
36   NormType  normtype;
37 
38   /* Used when pmat != MATH2OPUS */
39   PetscReal eta;
40   PetscInt  leafsize;
41   PetscInt  max_rank;
42   PetscInt  bs;
43   PetscReal mrtol;
44 
45   PetscBool boundtocpu;
46 } PC_H2OPUS;
47 
48 PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat,NormType,PetscReal*);
49 
50 static PetscErrorCode PCReset_H2OPUS(PC pc)
51 {
52   PC_H2OPUS      *pch2opus = (PC_H2OPUS*)pc->data;
53   PetscErrorCode ierr;
54 
55   PetscFunctionBegin;
56   pch2opus->sdim  = 0;
57   pch2opus->nlocc = 0;
58   ierr = PetscFree(pch2opus->coords);CHKERRQ(ierr);
59   ierr = MatDestroy(&pch2opus->A);CHKERRQ(ierr);
60   ierr = MatDestroy(&pch2opus->M);CHKERRQ(ierr);
61   ierr = MatDestroy(&pch2opus->T);CHKERRQ(ierr);
62   ierr = VecDestroy(&pch2opus->w);CHKERRQ(ierr);
63   ierr = MatDestroy(&pch2opus->S);CHKERRQ(ierr);
64   ierr = VecDestroy(&pch2opus->wns[0]);CHKERRQ(ierr);
65   ierr = VecDestroy(&pch2opus->wns[1]);CHKERRQ(ierr);
66   ierr = VecDestroy(&pch2opus->wns[2]);CHKERRQ(ierr);
67   ierr = VecDestroy(&pch2opus->wns[3]);CHKERRQ(ierr);
68   ierr = MatDestroy(&pch2opus->wnsmat[0]);CHKERRQ(ierr);
69   ierr = MatDestroy(&pch2opus->wnsmat[1]);CHKERRQ(ierr);
70   ierr = MatDestroy(&pch2opus->wnsmat[2]);CHKERRQ(ierr);
71   ierr = MatDestroy(&pch2opus->wnsmat[3]);CHKERRQ(ierr);
72   PetscFunctionReturn(0);
73 }
74 
75 static PetscErrorCode PCSetCoordinates_H2OPUS(PC pc, PetscInt sdim, PetscInt nlocc, PetscReal *coords)
76 {
77   PC_H2OPUS      *pch2opus = (PC_H2OPUS*)pc->data;
78   PetscBool      reset = PETSC_TRUE;
79   PetscErrorCode ierr;
80 
81   PetscFunctionBegin;
82   if (pch2opus->sdim && sdim == pch2opus->sdim && nlocc == pch2opus->nlocc) {
83     ierr  = PetscArraycmp(pch2opus->coords,coords,sdim*nlocc,&reset);CHKERRQ(ierr);
84     reset = (PetscBool)!reset;
85   }
86   ierr = MPIU_Allreduce(MPI_IN_PLACE,&reset,1,MPIU_BOOL,MPI_LOR,PetscObjectComm((PetscObject)pc));CHKERRMPI(ierr);
87   if (reset) {
88     ierr = PCReset_H2OPUS(pc);CHKERRQ(ierr);
89     ierr = PetscMalloc1(sdim*nlocc,&pch2opus->coords);CHKERRQ(ierr);
90     ierr = PetscArraycpy(pch2opus->coords,coords,sdim*nlocc);CHKERRQ(ierr);
91     pch2opus->sdim  = sdim;
92     pch2opus->nlocc = nlocc;
93   }
94   PetscFunctionReturn(0);
95 }
96 
97 static PetscErrorCode PCDestroy_H2OPUS(PC pc)
98 {
99   PetscErrorCode ierr;
100 
101   PetscFunctionBegin;
102   ierr = PCReset_H2OPUS(pc);CHKERRQ(ierr);
103   ierr = PetscObjectComposeFunction((PetscObject)pc,"PCSetCoordinates_C",NULL);CHKERRQ(ierr);
104   ierr = PetscFree(pc->data);CHKERRQ(ierr);
105   PetscFunctionReturn(0);
106 }
107 
108 static PetscErrorCode PCSetFromOptions_H2OPUS(PetscOptionItems *PetscOptionsObject,PC pc)
109 {
110   PC_H2OPUS      *pch2opus = (PC_H2OPUS*)pc->data;
111   PetscErrorCode ierr;
112 
113   PetscFunctionBegin;
114   ierr = PetscOptionsHead(PetscOptionsObject,"H2OPUS options");CHKERRQ(ierr);
115   ierr = PetscOptionsInt("-pc_h2opus_maxits","Maximum number of iterations for Newton-Schultz",NULL,pch2opus->maxits,&pch2opus->maxits,NULL);CHKERRQ(ierr);
116   ierr = PetscOptionsBool("-pc_h2opus_monitor","Monitor Newton-Schultz convergence",NULL,pch2opus->monitor,&pch2opus->monitor,NULL);CHKERRQ(ierr);
117   ierr = PetscOptionsReal("-pc_h2opus_atol","Absolute tolerance",NULL,pch2opus->atol,&pch2opus->atol,NULL);CHKERRQ(ierr);
118   ierr = PetscOptionsReal("-pc_h2opus_rtol","Relative tolerance",NULL,pch2opus->rtol,&pch2opus->rtol,NULL);CHKERRQ(ierr);
119   ierr = PetscOptionsEnum("-pc_h2opus_norm_type","Norm type for convergence monitoring",NULL,NormTypes,(PetscEnum)pch2opus->normtype,(PetscEnum*)&pch2opus->normtype,NULL);CHKERRQ(ierr);
120   ierr = PetscOptionsInt("-pc_h2opus_hyperorder","Hyper power order of sampling",NULL,pch2opus->hyperorder,&pch2opus->hyperorder,NULL);CHKERRQ(ierr);
121   ierr = PetscOptionsInt("-pc_h2opus_leafsize","Leaf size when constructed from kernel",NULL,pch2opus->leafsize,&pch2opus->leafsize,NULL);CHKERRQ(ierr);
122   ierr = PetscOptionsReal("-pc_h2opus_eta","Admissibility condition tolerance",NULL,pch2opus->eta,&pch2opus->eta,NULL);CHKERRQ(ierr);
123   ierr = PetscOptionsInt("-pc_h2opus_maxrank","Maximum rank when constructed from matvecs",NULL,pch2opus->max_rank,&pch2opus->max_rank,NULL);CHKERRQ(ierr);
124   ierr = PetscOptionsInt("-pc_h2opus_samples","Number of samples to be taken concurrently when constructing from matvecs",NULL,pch2opus->bs,&pch2opus->bs,NULL);CHKERRQ(ierr);
125   ierr = PetscOptionsReal("-pc_h2opus_mrtol","Relative tolerance for construction from sampling",NULL,pch2opus->mrtol,&pch2opus->mrtol,NULL);CHKERRQ(ierr);
126   ierr = PetscOptionsTail();CHKERRQ(ierr);
127   PetscFunctionReturn(0);
128 }
129 
130 typedef struct {
131   Mat A;
132   Mat M;
133   Vec w;
134 } AAtCtx;
135 
136 static PetscErrorCode MatMult_AAt(Mat A, Vec x, Vec y)
137 {
138   AAtCtx         *aat;
139   PetscErrorCode ierr;
140 
141   PetscFunctionBegin;
142   ierr = MatShellGetContext(A,(void*)&aat);CHKERRQ(ierr);
143   /* ierr = MatMultTranspose(aat->M,x,aat->w);CHKERRQ(ierr); */
144   ierr = MatMultTranspose(aat->A,x,aat->w);CHKERRQ(ierr);
145   ierr = MatMult(aat->A,aat->w,y);CHKERRQ(ierr);
146   PetscFunctionReturn(0);
147 }
148 
149 static PetscErrorCode PCH2OpusSetUpInit(PC pc)
150 {
151   PC_H2OPUS      *pch2opus = (PC_H2OPUS*)pc->data;
152   Mat            A = pc->useAmat ? pc->mat : pc->pmat, AAt;
153   PetscInt       M,m;
154   VecType        vtype;
155   PetscReal      n;
156   AAtCtx         aat;
157   PetscErrorCode ierr;
158 
159   PetscFunctionBegin;
160   aat.A = A;
161   aat.M = pch2opus->M; /* unused so far */
162   ierr = MatCreateVecs(A,NULL,&aat.w);CHKERRQ(ierr);
163   ierr = MatGetSize(A,&M,NULL);CHKERRQ(ierr);
164   ierr = MatGetLocalSize(A,&m,NULL);CHKERRQ(ierr);
165   ierr = MatCreateShell(PetscObjectComm((PetscObject)A),m,m,M,M,&aat,&AAt);CHKERRQ(ierr);
166   ierr = MatBindToCPU(AAt,pch2opus->boundtocpu);CHKERRQ(ierr);
167   ierr = MatShellSetOperation(AAt,MATOP_MULT,(void (*)(void))MatMult_AAt);CHKERRQ(ierr);
168   ierr = MatShellSetOperation(AAt,MATOP_MULT_TRANSPOSE,(void (*)(void))MatMult_AAt);CHKERRQ(ierr);
169   ierr = MatShellSetOperation(AAt,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS);CHKERRQ(ierr);
170   ierr = MatGetVecType(A,&vtype);CHKERRQ(ierr);
171   ierr = MatShellSetVecType(AAt,vtype);CHKERRQ(ierr);
172   ierr = MatNorm(AAt,NORM_1,&n);CHKERRQ(ierr);
173   pch2opus->s0 = 1./n;
174   ierr = VecDestroy(&aat.w);CHKERRQ(ierr);
175   ierr = MatDestroy(&AAt);CHKERRQ(ierr);
176   PetscFunctionReturn(0);
177 }
178 
179 static PetscErrorCode PCApplyKernel_H2OPUS(PC pc, Vec x, Vec y, PetscBool t)
180 {
181   PC_H2OPUS      *pch2opus = (PC_H2OPUS*)pc->data;
182   PetscErrorCode ierr;
183 
184   PetscFunctionBegin;
185   if (t) {
186     ierr = MatMultTranspose(pch2opus->M,x,y);CHKERRQ(ierr);
187   } else {
188     ierr = MatMult(pch2opus->M,x,y);CHKERRQ(ierr);
189   }
190   PetscFunctionReturn(0);
191 }
192 
193 static PetscErrorCode PCApplyMatKernel_H2OPUS(PC pc, Mat X, Mat Y, PetscBool t)
194 {
195   PC_H2OPUS      *pch2opus = (PC_H2OPUS*)pc->data;
196   PetscErrorCode ierr;
197 
198   PetscFunctionBegin;
199   if (t) {
200     ierr = MatTransposeMatMult(pch2opus->M,X,MAT_REUSE_MATRIX,PETSC_DEFAULT,&Y);CHKERRQ(ierr);
201   } else {
202     ierr = MatMatMult(pch2opus->M,X,MAT_REUSE_MATRIX,PETSC_DEFAULT,&Y);CHKERRQ(ierr);
203   }
204   PetscFunctionReturn(0);
205 }
206 
207 static PetscErrorCode PCApplyMat_H2OPUS(PC pc, Mat X, Mat Y)
208 {
209   PetscErrorCode ierr;
210 
211   PetscFunctionBegin;
212   ierr = PCApplyMatKernel_H2OPUS(pc,X,Y,PETSC_FALSE);CHKERRQ(ierr);
213   PetscFunctionReturn(0);
214 }
215 
216 static PetscErrorCode PCApplyTransposeMat_H2OPUS(PC pc, Mat X, Mat Y)
217 {
218   PetscErrorCode ierr;
219 
220   PetscFunctionBegin;
221   ierr = PCApplyMatKernel_H2OPUS(pc,X,Y,PETSC_TRUE);CHKERRQ(ierr);
222   PetscFunctionReturn(0);
223 }
224 
225 static PetscErrorCode PCApply_H2OPUS(PC pc, Vec x, Vec y)
226 {
227   PetscErrorCode ierr;
228 
229   PetscFunctionBegin;
230   ierr = PCApplyKernel_H2OPUS(pc,x,y,PETSC_FALSE);CHKERRQ(ierr);
231   PetscFunctionReturn(0);
232 }
233 
234 static PetscErrorCode PCApplyTranspose_H2OPUS(PC pc, Vec x, Vec y)
235 {
236   PetscErrorCode ierr;
237 
238   PetscFunctionBegin;
239   ierr = PCApplyKernel_H2OPUS(pc,x,y,PETSC_TRUE);CHKERRQ(ierr);
240   PetscFunctionReturn(0);
241 }
242 
243 /* used to test the norm of (M^-1 A - I) */
244 static PetscErrorCode MatMultKernel_MAmI(Mat M, Vec x, Vec y, PetscBool t)
245 {
246   PC             pc;
247   Mat            A;
248   PC_H2OPUS      *pch2opus;
249   PetscErrorCode ierr;
250   PetscBool      sideleft = PETSC_TRUE;
251 
252   PetscFunctionBegin;
253   ierr = MatShellGetContext(M,(void*)&pc);CHKERRQ(ierr);
254   pch2opus = (PC_H2OPUS*)pc->data;
255   if (!pch2opus->w) {
256     ierr = MatCreateVecs(pch2opus->M,&pch2opus->w,NULL);CHKERRQ(ierr);
257   }
258   A = pch2opus->A;
259   ierr = VecBindToCPU(pch2opus->w,pch2opus->boundtocpu);CHKERRQ(ierr);
260   if (t) {
261     if (sideleft) {
262       ierr = PCApplyTranspose_H2OPUS(pc,x,pch2opus->w);CHKERRQ(ierr);
263       ierr = MatMultTranspose(A,pch2opus->w,y);CHKERRQ(ierr);
264     } else {
265       ierr = MatMultTranspose(A,x,pch2opus->w);CHKERRQ(ierr);
266       ierr = PCApplyTranspose_H2OPUS(pc,pch2opus->w,y);CHKERRQ(ierr);
267     }
268   } else {
269     if (sideleft) {
270       ierr = MatMult(A,x,pch2opus->w);CHKERRQ(ierr);
271       ierr = PCApply_H2OPUS(pc,pch2opus->w,y);CHKERRQ(ierr);
272     } else {
273       ierr = PCApply_H2OPUS(pc,x,pch2opus->w);CHKERRQ(ierr);
274       ierr = MatMult(A,pch2opus->w,y);CHKERRQ(ierr);
275     }
276   }
277   if (!pch2opus->testMA) {
278     ierr = VecAXPY(y,-1.0,x);CHKERRQ(ierr);
279   }
280   PetscFunctionReturn(0);
281 }
282 
283 static PetscErrorCode MatMult_MAmI(Mat A, Vec x, Vec y)
284 {
285   PetscErrorCode ierr;
286 
287   PetscFunctionBegin;
288   ierr = MatMultKernel_MAmI(A,x,y,PETSC_FALSE);CHKERRQ(ierr);
289   PetscFunctionReturn(0);
290 }
291 
292 static PetscErrorCode MatMultTranspose_MAmI(Mat A, Vec x, Vec y)
293 {
294   PetscErrorCode ierr;
295 
296   PetscFunctionBegin;
297   ierr = MatMultKernel_MAmI(A,x,y,PETSC_TRUE);CHKERRQ(ierr);
298   PetscFunctionReturn(0);
299 }
300 
301 /* HyperPower kernel:
302 Y = R = x
303 for i = 1 . . . l - 1 do
304   R = (I - A * Xk) * R
305   Y = Y + R
306 Y = Xk * Y
307 */
308 static PetscErrorCode MatMultKernel_Hyper(Mat M, Vec x, Vec y, PetscBool t)
309 {
310   PC             pc;
311   Mat            A;
312   PC_H2OPUS      *pch2opus;
313   PetscInt       i;
314   PetscErrorCode ierr;
315 
316   PetscFunctionBegin;
317   ierr = MatShellGetContext(M,(void*)&pc);CHKERRQ(ierr);
318   pch2opus = (PC_H2OPUS*)pc->data;
319   A = pch2opus->A;
320   ierr = MatCreateVecs(pch2opus->M,pch2opus->wns[0] ? NULL : &pch2opus->wns[0],pch2opus->wns[1] ? NULL : &pch2opus->wns[1]);CHKERRQ(ierr);
321   ierr = MatCreateVecs(pch2opus->M,pch2opus->wns[2] ? NULL : &pch2opus->wns[2],pch2opus->wns[3] ? NULL : &pch2opus->wns[3]);CHKERRQ(ierr);
322   ierr = VecBindToCPU(pch2opus->wns[0],pch2opus->boundtocpu);CHKERRQ(ierr);
323   ierr = VecBindToCPU(pch2opus->wns[1],pch2opus->boundtocpu);CHKERRQ(ierr);
324   ierr = VecBindToCPU(pch2opus->wns[2],pch2opus->boundtocpu);CHKERRQ(ierr);
325   ierr = VecBindToCPU(pch2opus->wns[3],pch2opus->boundtocpu);CHKERRQ(ierr);
326   ierr = VecCopy(x,pch2opus->wns[0]);CHKERRQ(ierr);
327   ierr = VecCopy(x,pch2opus->wns[3]);CHKERRQ(ierr);
328   if (t) {
329     for (i=0;i<pch2opus->hyperorder-1;i++) {
330       ierr = MatMultTranspose(A,pch2opus->wns[0],pch2opus->wns[1]);CHKERRQ(ierr);
331       ierr = PCApplyTranspose_H2OPUS(pc,pch2opus->wns[1],pch2opus->wns[2]);CHKERRQ(ierr);
332       ierr = VecAXPY(pch2opus->wns[0],-1.,pch2opus->wns[2]);CHKERRQ(ierr);
333       ierr = VecAXPY(pch2opus->wns[3], 1.,pch2opus->wns[0]);CHKERRQ(ierr);
334     }
335     ierr = PCApplyTranspose_H2OPUS(pc,pch2opus->wns[3],y);CHKERRQ(ierr);
336   } else {
337     for (i=0;i<pch2opus->hyperorder-1;i++) {
338       ierr = PCApply_H2OPUS(pc,pch2opus->wns[0],pch2opus->wns[1]);CHKERRQ(ierr);
339       ierr = MatMult(A,pch2opus->wns[1],pch2opus->wns[2]);CHKERRQ(ierr);
340       ierr = VecAXPY(pch2opus->wns[0],-1.,pch2opus->wns[2]);CHKERRQ(ierr);
341       ierr = VecAXPY(pch2opus->wns[3], 1.,pch2opus->wns[0]);CHKERRQ(ierr);
342     }
343     ierr = PCApply_H2OPUS(pc,pch2opus->wns[3],y);CHKERRQ(ierr);
344   }
345   PetscFunctionReturn(0);
346 }
347 
348 static PetscErrorCode MatMult_Hyper(Mat M, Vec x, Vec y)
349 {
350   PetscErrorCode ierr;
351 
352   PetscFunctionBegin;
353   ierr = MatMultKernel_Hyper(M,x,y,PETSC_FALSE);CHKERRQ(ierr);
354   PetscFunctionReturn(0);
355 }
356 
357 static PetscErrorCode MatMultTranspose_Hyper(Mat M, Vec x, Vec y)
358 {
359   PetscErrorCode ierr;
360 
361   PetscFunctionBegin;
362   ierr = MatMultKernel_Hyper(M,x,y,PETSC_TRUE);CHKERRQ(ierr);
363   PetscFunctionReturn(0);
364 }
365 
366 /* Hyper power kernel, MatMat version */
367 static PetscErrorCode MatMatMultKernel_Hyper(Mat M, Mat X, Mat Y, PetscBool t)
368 {
369   PC             pc;
370   Mat            A;
371   PC_H2OPUS      *pch2opus;
372   PetscInt       i;
373   PetscErrorCode ierr;
374 
375   PetscFunctionBegin;
376   ierr = MatShellGetContext(M,(void*)&pc);CHKERRQ(ierr);
377   pch2opus = (PC_H2OPUS*)pc->data;
378   A = pch2opus->A;
379   if (pch2opus->wnsmat[0] && pch2opus->wnsmat[0]->cmap->N != X->cmap->N) {
380     ierr = MatDestroy(&pch2opus->wnsmat[0]);CHKERRQ(ierr);
381     ierr = MatDestroy(&pch2opus->wnsmat[1]);CHKERRQ(ierr);
382   }
383   if (!pch2opus->wnsmat[0]) {
384     ierr = MatDuplicate(X,MAT_SHARE_NONZERO_PATTERN,&pch2opus->wnsmat[0]);CHKERRQ(ierr);
385     ierr = MatDuplicate(Y,MAT_SHARE_NONZERO_PATTERN,&pch2opus->wnsmat[1]);CHKERRQ(ierr);
386   }
387   if (pch2opus->wnsmat[2] && pch2opus->wnsmat[2]->cmap->N != X->cmap->N) {
388     ierr = MatDestroy(&pch2opus->wnsmat[2]);CHKERRQ(ierr);
389     ierr = MatDestroy(&pch2opus->wnsmat[3]);CHKERRQ(ierr);
390   }
391   if (!pch2opus->wnsmat[2]) {
392     ierr = MatDuplicate(X,MAT_SHARE_NONZERO_PATTERN,&pch2opus->wnsmat[2]);CHKERRQ(ierr);
393     ierr = MatDuplicate(Y,MAT_SHARE_NONZERO_PATTERN,&pch2opus->wnsmat[3]);CHKERRQ(ierr);
394   }
395   ierr = MatCopy(X,pch2opus->wnsmat[0],SAME_NONZERO_PATTERN);CHKERRQ(ierr);
396   ierr = MatCopy(X,pch2opus->wnsmat[3],SAME_NONZERO_PATTERN);CHKERRQ(ierr);
397   if (t) {
398     for (i=0;i<pch2opus->hyperorder-1;i++) {
399       ierr = MatTransposeMatMult(A,pch2opus->wnsmat[0],MAT_REUSE_MATRIX,PETSC_DEFAULT,&pch2opus->wnsmat[1]);CHKERRQ(ierr);
400       ierr = PCApplyTransposeMat_H2OPUS(pc,pch2opus->wnsmat[1],pch2opus->wnsmat[2]);CHKERRQ(ierr);
401       ierr = MatAXPY(pch2opus->wnsmat[0],-1.,pch2opus->wnsmat[2],SAME_NONZERO_PATTERN);CHKERRQ(ierr);
402       ierr = MatAXPY(pch2opus->wnsmat[3],1.,pch2opus->wnsmat[0],SAME_NONZERO_PATTERN);CHKERRQ(ierr);
403     }
404     ierr = PCApplyTransposeMat_H2OPUS(pc,pch2opus->wnsmat[3],Y);CHKERRQ(ierr);
405   } else {
406     for (i=0;i<pch2opus->hyperorder-1;i++) {
407       ierr = PCApplyMat_H2OPUS(pc,pch2opus->wnsmat[0],pch2opus->wnsmat[1]);CHKERRQ(ierr);
408       ierr = MatMatMult(A,pch2opus->wnsmat[1],MAT_REUSE_MATRIX,PETSC_DEFAULT,&pch2opus->wnsmat[2]);CHKERRQ(ierr);
409       ierr = MatAXPY(pch2opus->wnsmat[0],-1.,pch2opus->wnsmat[2],SAME_NONZERO_PATTERN);CHKERRQ(ierr);
410       ierr = MatAXPY(pch2opus->wnsmat[3],1.,pch2opus->wnsmat[0],SAME_NONZERO_PATTERN);CHKERRQ(ierr);
411     }
412     ierr = PCApplyMat_H2OPUS(pc,pch2opus->wnsmat[3],Y);CHKERRQ(ierr);
413   }
414   PetscFunctionReturn(0);
415 }
416 
417 static PetscErrorCode MatMatMultNumeric_Hyper(Mat M, Mat X, Mat Y,void *ctx)
418 {
419   PetscErrorCode ierr;
420 
421   PetscFunctionBegin;
422   ierr = MatMatMultKernel_Hyper(M,X,Y,PETSC_FALSE);CHKERRQ(ierr);
423   PetscFunctionReturn(0);
424 }
425 
426 /* Basic Newton-Schultz sampler: (2 * I - M * A)*M */
427 static PetscErrorCode MatMultKernel_NS(Mat M, Vec x, Vec y, PetscBool t)
428 {
429   PC             pc;
430   Mat            A;
431   PC_H2OPUS      *pch2opus;
432   PetscErrorCode ierr;
433 
434   PetscFunctionBegin;
435   ierr = MatShellGetContext(M,(void*)&pc);CHKERRQ(ierr);
436   pch2opus = (PC_H2OPUS*)pc->data;
437   A = pch2opus->A;
438   ierr = MatCreateVecs(pch2opus->M,pch2opus->wns[0] ? NULL : &pch2opus->wns[0],pch2opus->wns[1] ? NULL : &pch2opus->wns[1]);CHKERRQ(ierr);
439   ierr = VecBindToCPU(pch2opus->wns[0],pch2opus->boundtocpu);CHKERRQ(ierr);
440   ierr = VecBindToCPU(pch2opus->wns[1],pch2opus->boundtocpu);CHKERRQ(ierr);
441   if (t) {
442     ierr = PCApplyTranspose_H2OPUS(pc,x,y);CHKERRQ(ierr);
443     ierr = MatMultTranspose(A,y,pch2opus->wns[1]);CHKERRQ(ierr);
444     ierr = PCApplyTranspose_H2OPUS(pc,pch2opus->wns[1],pch2opus->wns[0]);CHKERRQ(ierr);
445     ierr = VecAXPBY(y,-1.,2.,pch2opus->wns[0]);CHKERRQ(ierr);
446   } else {
447     ierr = PCApply_H2OPUS(pc,x,y);CHKERRQ(ierr);
448     ierr = MatMult(A,y,pch2opus->wns[0]);CHKERRQ(ierr);
449     ierr = PCApply_H2OPUS(pc,pch2opus->wns[0],pch2opus->wns[1]);CHKERRQ(ierr);
450     ierr = VecAXPBY(y,-1.,2.,pch2opus->wns[1]);CHKERRQ(ierr);
451   }
452   PetscFunctionReturn(0);
453 }
454 
455 static PetscErrorCode MatMult_NS(Mat M, Vec x, Vec y)
456 {
457   PetscErrorCode ierr;
458 
459   PetscFunctionBegin;
460   ierr = MatMultKernel_NS(M,x,y,PETSC_FALSE);CHKERRQ(ierr);
461   PetscFunctionReturn(0);
462 }
463 
464 static PetscErrorCode MatMultTranspose_NS(Mat M, Vec x, Vec y)
465 {
466   PetscErrorCode ierr;
467 
468   PetscFunctionBegin;
469   ierr = MatMultKernel_NS(M,x,y,PETSC_TRUE);CHKERRQ(ierr);
470   PetscFunctionReturn(0);
471 }
472 
473 /* Basic Newton-Schultz sampler: (2 * I - M * A)*M, MatMat version */
474 static PetscErrorCode MatMatMultKernel_NS(Mat M, Mat X, Mat Y, PetscBool t)
475 {
476   PC             pc;
477   Mat            A;
478   PC_H2OPUS      *pch2opus;
479   PetscErrorCode ierr;
480 
481   PetscFunctionBegin;
482   ierr = MatShellGetContext(M,(void*)&pc);CHKERRQ(ierr);
483   pch2opus = (PC_H2OPUS*)pc->data;
484   A = pch2opus->A;
485   if (pch2opus->wnsmat[0] && pch2opus->wnsmat[0]->cmap->N != X->cmap->N) {
486     ierr = MatDestroy(&pch2opus->wnsmat[0]);CHKERRQ(ierr);
487     ierr = MatDestroy(&pch2opus->wnsmat[1]);CHKERRQ(ierr);
488   }
489   if (!pch2opus->wnsmat[0]) {
490     ierr = MatDuplicate(X,MAT_SHARE_NONZERO_PATTERN,&pch2opus->wnsmat[0]);CHKERRQ(ierr);
491     ierr = MatDuplicate(Y,MAT_SHARE_NONZERO_PATTERN,&pch2opus->wnsmat[1]);CHKERRQ(ierr);
492   }
493   if (t) {
494     ierr = PCApplyTransposeMat_H2OPUS(pc,X,Y);CHKERRQ(ierr);
495     ierr = MatTransposeMatMult(A,Y,MAT_REUSE_MATRIX,PETSC_DEFAULT,&pch2opus->wnsmat[1]);CHKERRQ(ierr);
496     ierr = PCApplyTransposeMat_H2OPUS(pc,pch2opus->wnsmat[1],pch2opus->wnsmat[0]);CHKERRQ(ierr);
497     ierr = MatScale(Y,2.);CHKERRQ(ierr);
498     ierr = MatAXPY(Y,-1.,pch2opus->wnsmat[0],SAME_NONZERO_PATTERN);CHKERRQ(ierr);
499   } else {
500     ierr = PCApplyMat_H2OPUS(pc,X,Y);CHKERRQ(ierr);
501     ierr = MatMatMult(A,Y,MAT_REUSE_MATRIX,PETSC_DEFAULT,&pch2opus->wnsmat[0]);CHKERRQ(ierr);
502     ierr = PCApplyMat_H2OPUS(pc,pch2opus->wnsmat[0],pch2opus->wnsmat[1]);CHKERRQ(ierr);
503     ierr = MatScale(Y,2.);CHKERRQ(ierr);
504     ierr = MatAXPY(Y,-1.,pch2opus->wnsmat[1],SAME_NONZERO_PATTERN);CHKERRQ(ierr);
505   }
506   PetscFunctionReturn(0);
507 }
508 
509 static PetscErrorCode MatMatMultNumeric_NS(Mat M, Mat X, Mat Y, void *ctx)
510 {
511   PetscErrorCode ierr;
512 
513   PetscFunctionBegin;
514   ierr = MatMatMultKernel_NS(M,X,Y,PETSC_FALSE);CHKERRQ(ierr);
515   PetscFunctionReturn(0);
516 }
517 
518 static PetscErrorCode PCH2OpusSetUpSampler_Private(PC pc)
519 {
520   PC_H2OPUS      *pch2opus = (PC_H2OPUS*)pc->data;
521   Mat            A = pc->useAmat ? pc->mat : pc->pmat;
522   PetscErrorCode ierr;
523 
524   PetscFunctionBegin;
525   if (!pch2opus->S) {
526     PetscInt M,N,m,n;
527 
528     ierr = MatGetSize(A,&M,&N);CHKERRQ(ierr);
529     ierr = MatGetLocalSize(A,&m,&n);CHKERRQ(ierr);
530     ierr = MatCreateShell(PetscObjectComm((PetscObject)A),m,n,M,N,pc,&pch2opus->S);CHKERRQ(ierr);
531     ierr = MatSetBlockSizesFromMats(pch2opus->S,A,A);CHKERRQ(ierr);
532 #if defined(PETSC_H2OPUS_USE_GPU)
533     ierr = MatShellSetVecType(pch2opus->S,VECCUDA);CHKERRQ(ierr);
534 #endif
535   }
536   if (pch2opus->hyperorder >= 2) {
537     ierr = MatShellSetOperation(pch2opus->S,MATOP_MULT,(void (*)(void))MatMult_Hyper);CHKERRQ(ierr);
538     ierr = MatShellSetOperation(pch2opus->S,MATOP_MULT_TRANSPOSE,(void (*)(void))MatMultTranspose_Hyper);CHKERRQ(ierr);
539     ierr = MatShellSetMatProductOperation(pch2opus->S,MATPRODUCT_AB,NULL,MatMatMultNumeric_Hyper,NULL,MATDENSE,MATDENSE);CHKERRQ(ierr);
540     ierr = MatShellSetMatProductOperation(pch2opus->S,MATPRODUCT_AB,NULL,MatMatMultNumeric_Hyper,NULL,MATDENSECUDA,MATDENSECUDA);CHKERRQ(ierr);
541   } else {
542     ierr = MatShellSetOperation(pch2opus->S,MATOP_MULT,(void (*)(void))MatMult_NS);CHKERRQ(ierr);
543     ierr = MatShellSetOperation(pch2opus->S,MATOP_MULT_TRANSPOSE,(void (*)(void))MatMultTranspose_NS);CHKERRQ(ierr);
544     ierr = MatShellSetMatProductOperation(pch2opus->S,MATPRODUCT_AB,NULL,MatMatMultNumeric_NS,NULL,MATDENSE,MATDENSE);CHKERRQ(ierr);
545     ierr = MatShellSetMatProductOperation(pch2opus->S,MATPRODUCT_AB,NULL,MatMatMultNumeric_NS,NULL,MATDENSECUDA,MATDENSECUDA);CHKERRQ(ierr);
546   }
547   ierr = MatPropagateSymmetryOptions(A,pch2opus->S);CHKERRQ(ierr);
548   ierr = MatBindToCPU(pch2opus->S,pch2opus->boundtocpu);CHKERRQ(ierr);
549   /* XXX */
550   ierr = MatSetOption(pch2opus->S,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
551   PetscFunctionReturn(0);
552 }
553 
554 static PetscErrorCode PCSetUp_H2OPUS(PC pc)
555 {
556   PC_H2OPUS      *pch2opus = (PC_H2OPUS*)pc->data;
557   Mat            A = pc->useAmat ? pc->mat : pc->pmat;
558   NormType       norm = pch2opus->normtype;
559   PetscReal      initerr = 0.0,err;
560   PetscReal      initerrMA = 0.0,errMA;
561   PetscBool      ish2opus;
562   PetscErrorCode ierr;
563 
564   PetscFunctionBegin;
565   if (!pch2opus->T) {
566     PetscInt    M,N,m,n;
567     const char *prefix;
568 
569     ierr = PCGetOptionsPrefix(pc,&prefix);CHKERRQ(ierr);
570     ierr = MatGetSize(A,&M,&N);CHKERRQ(ierr);
571     ierr = MatGetLocalSize(A,&m,&n);CHKERRQ(ierr);
572     ierr = MatCreateShell(PetscObjectComm((PetscObject)pc->pmat),m,n,M,N,pc,&pch2opus->T);CHKERRQ(ierr);
573     ierr = MatSetBlockSizesFromMats(pch2opus->T,A,A);CHKERRQ(ierr);
574     ierr = MatShellSetOperation(pch2opus->T,MATOP_MULT,(void (*)(void))MatMult_MAmI);CHKERRQ(ierr);
575     ierr = MatShellSetOperation(pch2opus->T,MATOP_MULT_TRANSPOSE,(void (*)(void))MatMultTranspose_MAmI);CHKERRQ(ierr);
576     ierr = MatShellSetOperation(pch2opus->T,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS);CHKERRQ(ierr);
577 #if defined(PETSC_H2OPUS_USE_GPU)
578     ierr = MatShellSetVecType(pch2opus->T,VECCUDA);CHKERRQ(ierr);
579 #endif
580     ierr = PetscLogObjectParent((PetscObject)pc,(PetscObject)pch2opus->T);CHKERRQ(ierr);
581     ierr = MatSetOptionsPrefix(pch2opus->T,prefix);CHKERRQ(ierr);
582     ierr = MatAppendOptionsPrefix(pch2opus->T,"pc_h2opus_est_");CHKERRQ(ierr);
583   }
584   ierr = MatDestroy(&pch2opus->A);CHKERRQ(ierr);
585   ierr = PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus);CHKERRQ(ierr);
586   if (ish2opus) {
587     ierr = PetscObjectReference((PetscObject)A);CHKERRQ(ierr);
588     pch2opus->A = A;
589   } else {
590     const char *prefix;
591     ierr = MatCreateH2OpusFromMat(A,pch2opus->sdim,pch2opus->coords,PETSC_FALSE,pch2opus->eta,pch2opus->leafsize,pch2opus->max_rank,pch2opus->bs,pch2opus->mrtol,&pch2opus->A);CHKERRQ(ierr);
592     /* XXX */
593     ierr = MatSetOption(pch2opus->A,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
594     ierr = PCGetOptionsPrefix(pc,&prefix);CHKERRQ(ierr);
595     ierr = MatSetOptionsPrefix(pch2opus->A,prefix);CHKERRQ(ierr);
596     ierr = MatAppendOptionsPrefix(pch2opus->A,"pc_h2opus_init_");CHKERRQ(ierr);
597     ierr = MatSetFromOptions(pch2opus->A);CHKERRQ(ierr);
598     ierr = MatAssemblyBegin(pch2opus->A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
599     ierr = MatAssemblyEnd(pch2opus->A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
600     /* XXX */
601     ierr = MatSetOption(pch2opus->A,MAT_SYMMETRIC,PETSC_TRUE);CHKERRQ(ierr);
602   }
603 #if defined(PETSC_H2OPUS_USE_GPU)
604   pch2opus->boundtocpu = pch2opus->A->boundtocpu;
605 #endif
606   ierr = MatBindToCPU(pch2opus->T,pch2opus->boundtocpu);CHKERRQ(ierr);
607   if (pch2opus->M) { /* see if we can reuse M as initial guess */
608     PetscReal reuse;
609 
610     ierr = MatBindToCPU(pch2opus->M,pch2opus->boundtocpu);CHKERRQ(ierr);
611     ierr = MatNorm(pch2opus->T,norm,&reuse);CHKERRQ(ierr);
612     if (reuse >= 1.0) {
613       ierr = MatDestroy(&pch2opus->M);CHKERRQ(ierr);
614     }
615   }
616   if (!pch2opus->M) {
617     const char *prefix;
618     ierr = MatDuplicate(pch2opus->A,MAT_COPY_VALUES,&pch2opus->M);CHKERRQ(ierr);
619     ierr = PCGetOptionsPrefix(pc,&prefix);CHKERRQ(ierr);
620     ierr = MatSetOptionsPrefix(pch2opus->M,prefix);CHKERRQ(ierr);
621     ierr = MatAppendOptionsPrefix(pch2opus->M,"pc_h2opus_inv_");CHKERRQ(ierr);
622     ierr = MatSetFromOptions(pch2opus->M);CHKERRQ(ierr);
623     ierr = PCH2OpusSetUpInit(pc);CHKERRQ(ierr);
624     ierr = MatScale(pch2opus->M,pch2opus->s0);CHKERRQ(ierr);
625   }
626   /* A and M have the same h2 matrix structure, save on reordering routines */
627   ierr = MatH2OpusSetNativeMult(pch2opus->A,PETSC_TRUE);CHKERRQ(ierr);
628   ierr = MatH2OpusSetNativeMult(pch2opus->M,PETSC_TRUE);CHKERRQ(ierr);
629   if (norm == NORM_1 || norm == NORM_2 || norm == NORM_INFINITY) {
630     ierr = MatNorm(pch2opus->T,norm,&initerr);CHKERRQ(ierr);
631     pch2opus->testMA = PETSC_TRUE;
632     ierr = MatNorm(pch2opus->T,norm,&initerrMA);CHKERRQ(ierr);
633     pch2opus->testMA = PETSC_FALSE;
634   }
635   if (PetscIsInfOrNanReal(initerr)) pc->failedreason = PC_SETUP_ERROR;
636   err   = initerr;
637   errMA = initerrMA;
638   if (pch2opus->monitor) {
639     ierr = PetscPrintf(PetscObjectComm((PetscObject)pc),"%D: ||M*A - I|| NORM%s abs %g rel %g\n",0,NormTypes[norm],(double)err,(double)(err/initerr));CHKERRQ(ierr);
640     ierr = PetscPrintf(PetscObjectComm((PetscObject)pc),"%D: ||M*A||     NORM%s abs %g rel %g\n",0,NormTypes[norm],(double)errMA,(double)(errMA/initerrMA));CHKERRQ(ierr);
641   }
642   if (initerr > pch2opus->atol && !pc->failedreason) {
643     PetscInt i;
644 
645     ierr = PCH2OpusSetUpSampler_Private(pc);CHKERRQ(ierr);
646     for (i = 0; i < pch2opus->maxits; i++) {
647       Mat         M;
648       const char* prefix;
649 
650       ierr = MatDuplicate(pch2opus->M,MAT_SHARE_NONZERO_PATTERN,&M);CHKERRQ(ierr);
651       ierr = MatGetOptionsPrefix(pch2opus->M,&prefix);CHKERRQ(ierr);
652       ierr = MatSetOptionsPrefix(M,prefix);CHKERRQ(ierr);
653       ierr = MatH2OpusSetSamplingMat(M,pch2opus->S,PETSC_DECIDE,PETSC_DECIDE);CHKERRQ(ierr);
654       ierr = MatSetFromOptions(M);CHKERRQ(ierr);
655       ierr = MatH2OpusSetNativeMult(M,PETSC_TRUE);CHKERRQ(ierr);
656       ierr = MatAssemblyBegin(M,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
657       ierr = MatAssemblyEnd(M,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
658 
659       ierr = MatDestroy(&pch2opus->M);CHKERRQ(ierr);
660       pch2opus->M = M;
661       if (norm == NORM_1 || norm == NORM_2 || norm == NORM_INFINITY) {
662         ierr = MatNorm(pch2opus->T,norm,&err);CHKERRQ(ierr);
663         pch2opus->testMA = PETSC_TRUE;
664         ierr = MatNorm(pch2opus->T,norm,&errMA);CHKERRQ(ierr);
665         pch2opus->testMA = PETSC_FALSE;
666       }
667       if (pch2opus->monitor) {
668         ierr = PetscPrintf(PetscObjectComm((PetscObject)pc),"%D: ||M*A - I|| NORM%s abs %g rel %g\n",i+1,NormTypes[norm],(double)err,(double)(err/initerr));CHKERRQ(ierr);
669         ierr = PetscPrintf(PetscObjectComm((PetscObject)pc),"%D: ||M*A||     NORM%s abs %g rel %g\n",i+1,NormTypes[norm],(double)errMA,(double)(errMA/initerrMA));CHKERRQ(ierr);
670       }
671       if (PetscIsInfOrNanReal(err)) pc->failedreason = PC_SETUP_ERROR;
672       if (err < pch2opus->atol || err < pch2opus->rtol*initerr || pc->failedreason) break;
673     }
674   }
675   /* cleanup setup workspace */
676   ierr = MatH2OpusSetNativeMult(pch2opus->A,PETSC_FALSE);CHKERRQ(ierr);
677   ierr = MatH2OpusSetNativeMult(pch2opus->M,PETSC_FALSE);CHKERRQ(ierr);
678   ierr = VecDestroy(&pch2opus->wns[0]);CHKERRQ(ierr);
679   ierr = VecDestroy(&pch2opus->wns[1]);CHKERRQ(ierr);
680   ierr = VecDestroy(&pch2opus->wns[2]);CHKERRQ(ierr);
681   ierr = VecDestroy(&pch2opus->wns[3]);CHKERRQ(ierr);
682   ierr = MatDestroy(&pch2opus->wnsmat[0]);CHKERRQ(ierr);
683   ierr = MatDestroy(&pch2opus->wnsmat[1]);CHKERRQ(ierr);
684   ierr = MatDestroy(&pch2opus->wnsmat[2]);CHKERRQ(ierr);
685   ierr = MatDestroy(&pch2opus->wnsmat[3]);CHKERRQ(ierr);
686   PetscFunctionReturn(0);
687 }
688 
689 static PetscErrorCode PCView_H2OPUS(PC pc, PetscViewer viewer)
690 {
691   PetscErrorCode ierr;
692   PC_H2OPUS      *pch2opus = (PC_H2OPUS*)pc->data;
693   PetscBool      isascii;
694 
695   PetscFunctionBegin;
696   ierr = PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&isascii);CHKERRQ(ierr);
697   if (isascii) {
698     if (pch2opus->A && pch2opus->A != pc->mat && pch2opus->A != pc->pmat) {
699       ierr = PetscViewerASCIIPrintf(viewer,"Initial approximation matrix\n");CHKERRQ(ierr);
700       ierr = PetscViewerASCIIPushTab(viewer);CHKERRQ(ierr);
701       ierr = PetscViewerPushFormat(viewer,PETSC_VIEWER_ASCII_INFO_DETAIL);CHKERRQ(ierr);
702       ierr = MatView(pch2opus->A,viewer);CHKERRQ(ierr);
703       ierr = PetscViewerPopFormat(viewer);CHKERRQ(ierr);
704       ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
705     }
706     if (pch2opus->M) {
707       ierr = PetscViewerASCIIPrintf(viewer,"Inner matrix constructed\n");CHKERRQ(ierr);
708       ierr = PetscViewerASCIIPushTab(viewer);CHKERRQ(ierr);
709       ierr = PetscViewerPushFormat(viewer,PETSC_VIEWER_ASCII_INFO_DETAIL);CHKERRQ(ierr);
710       ierr = MatView(pch2opus->M,viewer);CHKERRQ(ierr);
711       ierr = PetscViewerPopFormat(viewer);CHKERRQ(ierr);
712       ierr = PetscViewerASCIIPopTab(viewer);CHKERRQ(ierr);
713     }
714     ierr = PetscViewerASCIIPrintf(viewer,"Initial scaling: %g\n",pch2opus->s0);CHKERRQ(ierr);
715   }
716   PetscFunctionReturn(0);
717 }
718 
719 PETSC_EXTERN PetscErrorCode PCCreate_H2OPUS(PC pc)
720 {
721   PetscErrorCode ierr;
722   PC_H2OPUS      *pch2opus;
723 
724   PetscFunctionBegin;
725   ierr     = PetscNewLog(pc,&pch2opus);CHKERRQ(ierr);
726   pc->data = (void*)pch2opus;
727 
728   pch2opus->atol       = 1.e-2;
729   pch2opus->rtol       = 1.e-6;
730   pch2opus->maxits     = 50;
731   pch2opus->hyperorder = 1; /* defaults to basic NewtonSchultz */
732   pch2opus->normtype   = NORM_2;
733 
734   /* these are needed when we are sampling the pmat */
735   pch2opus->eta        = PETSC_DECIDE;
736   pch2opus->leafsize   = PETSC_DECIDE;
737   pch2opus->max_rank   = PETSC_DECIDE;
738   pch2opus->bs         = PETSC_DECIDE;
739   pch2opus->mrtol      = PETSC_DECIDE;
740 #if defined(PETSC_H2OPUS_USE_GPU)
741   pch2opus->boundtocpu = PETSC_FALSE;
742 #else
743   pch2opus->boundtocpu = PETSC_TRUE;
744 #endif
745   pc->ops->destroy        = PCDestroy_H2OPUS;
746   pc->ops->setup          = PCSetUp_H2OPUS;
747   pc->ops->apply          = PCApply_H2OPUS;
748   pc->ops->matapply       = PCApplyMat_H2OPUS;
749   pc->ops->applytranspose = PCApplyTranspose_H2OPUS;
750   pc->ops->reset          = PCReset_H2OPUS;
751   pc->ops->setfromoptions = PCSetFromOptions_H2OPUS;
752   pc->ops->view           = PCView_H2OPUS;
753 
754   ierr = PetscObjectComposeFunction((PetscObject)pc,"PCSetCoordinates_C",PCSetCoordinates_H2OPUS);CHKERRQ(ierr);
755   PetscFunctionReturn(0);
756 }
757