xref: /petsc/src/mat/impls/h2opus/cuda/math2opus.cu (revision 4ffacfe27a72f4cdf51b68a3bbb6aed96040fb2f)
1 #include <h2opusconf.h>
2 /* skip compilation of this .cu file if H2OPUS is CPU only while PETSc has GPU support */
3 #if !defined(__CUDACC__) || defined(H2OPUS_USE_GPU)
4 #include <h2opus.h>
5 #if defined(H2OPUS_USE_MPI)
6 #include <h2opus/distributed/distributed_h2opus_handle.h>
7 #include <h2opus/distributed/distributed_geometric_construction.h>
8 #include <h2opus/distributed/distributed_hgemv.h>
9 #include <h2opus/distributed/distributed_horthog.h>
10 #include <h2opus/distributed/distributed_hcompress.h>
11 #endif
12 #include <h2opus/util/boxentrygen.h>
13 #include <petsc/private/matimpl.h>
14 #include <petsc/private/vecimpl.h>
15 #include <petsc/private/deviceimpl.h>
16 #include <petscsf.h>
17 
18 /* math2opusutils */
19 PETSC_INTERN PetscErrorCode PetscSFGetVectorSF(PetscSF,PetscInt,PetscInt,PetscInt,PetscSF*);
20 PETSC_INTERN PetscErrorCode MatDenseGetH2OpusVectorSF(Mat,PetscSF,PetscSF*);
21 PETSC_INTERN PetscErrorCode VecSign(Vec,Vec);
22 PETSC_INTERN PetscErrorCode VecSetDelta(Vec,PetscInt);
23 PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat,NormType,PetscInt,PetscReal*);
24 
25 #define MatH2OpusGetThrustPointer(v) thrust::raw_pointer_cast((v).data())
26 
27 /* Use GPU only if H2OPUS is configured for GPU */
28 #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU)
29 #define PETSC_H2OPUS_USE_GPU
30 #endif
31 #if defined(PETSC_H2OPUS_USE_GPU)
32 #define MatH2OpusUpdateIfNeeded(A,B) MatBindToCPU(A,(PetscBool)((A)->boundtocpu || (B)))
33 #else
34 #define MatH2OpusUpdateIfNeeded(A,B) 0
35 #endif
36 
37 // TODO H2OPUS:
38 // DistributedHMatrix
39 //   unsymmetric ?
40 //   transpose for distributed_hgemv?
41 //   clearData()
42 // Unify interface for sequential and parallel?
43 // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled)
44 //
45 template <class T> class PetscPointCloud : public H2OpusDataSet<T>
46 {
47   private:
48     int dimension;
49     size_t num_points;
50     std::vector<T> pts;
51 
52   public:
53     PetscPointCloud(int dim, size_t num_pts, const T coords[])
54     {
55       dim = dim > 0 ? dim : 1;
56       this->dimension = dim;
57       this->num_points = num_pts;
58 
59       pts.resize(num_pts*dim);
60       if (coords) {
61         for (size_t n = 0; n < num_pts; n++)
62           for (int i = 0; i < dim; i++)
63             pts[n*dim + i] = coords[n*dim + i];
64       } else {
65         PetscReal h = 1.0; //num_pts > 1 ? 1./(num_pts - 1) : 0.0;
66         for (size_t n = 0; n < num_pts; n++) {
67           pts[n*dim] = n*h;
68           for (int i = 1; i < dim; i++)
69             pts[n*dim + i] = 0.0;
70         }
71       }
72     }
73 
74     PetscPointCloud(const PetscPointCloud<T>& other)
75     {
76       size_t N = other.dimension * other.num_points;
77       this->dimension = other.dimension;
78       this->num_points = other.num_points;
79       this->pts.resize(N);
80       for (size_t i = 0; i < N; i++)
81         this->pts[i] = other.pts[i];
82     }
83 
84     int getDimension() const
85     {
86         return dimension;
87     }
88 
89     size_t getDataSetSize() const
90     {
91         return num_points;
92     }
93 
94     T getDataPoint(size_t idx, int dim) const
95     {
96         assert(dim < dimension && idx < num_points);
97         return pts[idx*dimension + dim];
98     }
99 
100     void Print(std::ostream& out = std::cout)
101     {
102        out << "Dimension: " << dimension << std::endl;
103        out << "NumPoints: " << num_points << std::endl;
104        for (size_t n = 0; n < num_points; n++) {
105          for (int d = 0; d < dimension; d++)
106            out << pts[n*dimension + d] << " ";
107          out << std::endl;
108        }
109     }
110 };
111 
112 template<class T> class PetscFunctionGenerator
113 {
114 private:
115   MatH2OpusKernel k;
116   int             dim;
117   void            *ctx;
118 
119 public:
120     PetscFunctionGenerator(MatH2OpusKernel k, int dim, void* ctx) { this->k = k; this->dim = dim; this->ctx = ctx; }
121     PetscFunctionGenerator(PetscFunctionGenerator& other) { this->k = other.k; this->dim = other.dim; this->ctx = other.ctx; }
122     T operator()(PetscReal *pt1, PetscReal *pt2)
123     {
124         return (T)((*this->k)(this->dim,pt1,pt2,this->ctx));
125     }
126 };
127 
128 #include <../src/mat/impls/h2opus/math2opussampler.hpp>
129 
130 /* just to not clutter the code */
131 #if !defined(H2OPUS_USE_GPU)
132 typedef HMatrix HMatrix_GPU;
133 #if defined(H2OPUS_USE_MPI)
134 typedef DistributedHMatrix DistributedHMatrix_GPU;
135 #endif
136 #endif
137 
138 typedef struct {
139 #if defined(H2OPUS_USE_MPI)
140   distributedH2OpusHandle_t handle;
141 #else
142   h2opusHandle_t handle;
143 #endif
144   /* Sequential and parallel matrices are two different classes at the moment */
145   HMatrix *hmatrix;
146 #if defined(H2OPUS_USE_MPI)
147   DistributedHMatrix *dist_hmatrix;
148 #else
149   HMatrix *dist_hmatrix; /* just to not clutter the code */
150 #endif
151   /* May use permutations */
152   PetscSF sf;
153   PetscLayout h2opus_rmap, h2opus_cmap;
154   IS h2opus_indexmap;
155   thrust::host_vector<PetscScalar> *xx,*yy;
156   PetscInt xxs,yys;
157   PetscBool multsetup;
158 
159   /* GPU */
160   HMatrix_GPU *hmatrix_gpu;
161 #if defined(H2OPUS_USE_MPI)
162   DistributedHMatrix_GPU *dist_hmatrix_gpu;
163 #else
164   HMatrix_GPU *dist_hmatrix_gpu; /* just to not clutter the code */
165 #endif
166 #if defined(PETSC_H2OPUS_USE_GPU)
167   thrust::device_vector<PetscScalar> *xx_gpu,*yy_gpu;
168   PetscInt xxs_gpu,yys_gpu;
169 #endif
170 
171   /* construction from matvecs */
172   PetscMatrixSampler* sampler;
173   PetscBool nativemult;
174 
175   /* Admissibility */
176   PetscReal eta;
177   PetscInt  leafsize;
178 
179   /* for dof reordering */
180   PetscPointCloud<PetscReal> *ptcloud;
181 
182   /* kernel for generating matrix entries */
183   PetscFunctionGenerator<PetscScalar> *kernel;
184 
185   /* basis orthogonalized? */
186   PetscBool orthogonal;
187 
188   /* customization */
189   PetscInt  basisord;
190   PetscInt  max_rank;
191   PetscInt  bs;
192   PetscReal rtol;
193   PetscInt  norm_max_samples;
194   PetscBool check_construction;
195   PetscBool hara_verbose;
196   PetscBool resize;
197 
198   /* keeps track of MatScale values */
199   PetscScalar s;
200 } Mat_H2OPUS;
201 
202 static PetscErrorCode MatDestroy_H2OPUS(Mat A)
203 {
204   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
205 
206   PetscFunctionBegin;
207 #if defined(H2OPUS_USE_MPI)
208   h2opusDestroyDistributedHandle(a->handle);
209 #else
210   h2opusDestroyHandle(a->handle);
211 #endif
212   delete a->dist_hmatrix;
213   delete a->hmatrix;
214   PetscCall(PetscSFDestroy(&a->sf));
215   PetscCall(PetscLayoutDestroy(&a->h2opus_rmap));
216   PetscCall(PetscLayoutDestroy(&a->h2opus_cmap));
217   PetscCall(ISDestroy(&a->h2opus_indexmap));
218   delete a->xx;
219   delete a->yy;
220   delete a->hmatrix_gpu;
221   delete a->dist_hmatrix_gpu;
222 #if defined(PETSC_H2OPUS_USE_GPU)
223   delete a->xx_gpu;
224   delete a->yy_gpu;
225 #endif
226   delete a->sampler;
227   delete a->ptcloud;
228   delete a->kernel;
229   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdense_C",NULL));
230   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdensecuda_C",NULL));
231   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidense_C",NULL));
232   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidensecuda_C",NULL));
233   PetscCall(PetscObjectChangeTypeName((PetscObject)A,NULL));
234   PetscCall(PetscFree(A->data));
235   PetscFunctionReturn(0);
236 }
237 
238 PetscErrorCode MatH2OpusSetNativeMult(Mat A, PetscBool nm)
239 {
240   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
241   PetscBool      ish2opus;
242 
243   PetscFunctionBegin;
244   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
245   PetscValidLogicalCollectiveBool(A,nm,2);
246   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
247   if (ish2opus) {
248     if (a->h2opus_rmap) { /* need to swap layouts for vector creation */
249       if ((!a->nativemult && nm) || (a->nativemult && !nm)) {
250         PetscLayout t;
251         t = A->rmap;
252         A->rmap = a->h2opus_rmap;
253         a->h2opus_rmap = t;
254         t = A->cmap;
255         A->cmap = a->h2opus_cmap;
256         a->h2opus_cmap = t;
257       }
258     }
259     a->nativemult = nm;
260   }
261   PetscFunctionReturn(0);
262 }
263 
264 PetscErrorCode MatH2OpusGetNativeMult(Mat A, PetscBool *nm)
265 {
266   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
267   PetscBool      ish2opus;
268 
269   PetscFunctionBegin;
270   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
271   PetscValidPointer(nm,2);
272   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
273   PetscCheck(ish2opus,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name);
274   *nm = a->nativemult;
275   PetscFunctionReturn(0);
276 }
277 
278 PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat A, NormType normtype, PetscReal* n)
279 {
280   PetscBool      ish2opus;
281   PetscInt       nmax = PETSC_DECIDE;
282   Mat_H2OPUS     *a = NULL;
283   PetscBool      mult = PETSC_FALSE;
284 
285   PetscFunctionBegin;
286   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
287   if (ish2opus) { /* set userdefine number of samples and fastpath for mult (norms are order independent) */
288     a = (Mat_H2OPUS*)A->data;
289 
290     nmax = a->norm_max_samples;
291     mult = a->nativemult;
292     PetscCall(MatH2OpusSetNativeMult(A,PETSC_TRUE));
293   } else {
294     PetscCall(PetscOptionsGetInt(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_approximate_norm_samples",&nmax,NULL));
295   }
296   PetscCall(MatApproximateNorm_Private(A,normtype,nmax,n));
297   if (a) PetscCall(MatH2OpusSetNativeMult(A,mult));
298   PetscFunctionReturn(0);
299 }
300 
301 static PetscErrorCode MatH2OpusResizeBuffers_Private(Mat A, PetscInt xN, PetscInt yN)
302 {
303   Mat_H2OPUS     *h2opus = (Mat_H2OPUS*)A->data;
304   PetscInt       n;
305   PetscBool      boundtocpu = PETSC_TRUE;
306 
307   PetscFunctionBegin;
308 #if defined(PETSC_H2OPUS_USE_GPU)
309   boundtocpu = A->boundtocpu;
310 #endif
311   PetscCall(PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL));
312   if (boundtocpu) {
313     if (h2opus->xxs < xN) { h2opus->xx->resize(n*xN); h2opus->xxs = xN; }
314     if (h2opus->yys < yN) { h2opus->yy->resize(n*yN); h2opus->yys = yN; }
315   }
316 #if defined(PETSC_H2OPUS_USE_GPU)
317   if (!boundtocpu) {
318     if (h2opus->xxs_gpu < xN) { h2opus->xx_gpu->resize(n*xN); h2opus->xxs_gpu = xN; }
319     if (h2opus->yys_gpu < yN) { h2opus->yy_gpu->resize(n*yN); h2opus->yys_gpu = yN; }
320   }
321 #endif
322   PetscFunctionReturn(0);
323 }
324 
325 static PetscErrorCode MatMultNKernel_H2OPUS(Mat A, PetscBool transA, Mat B, Mat C)
326 {
327   Mat_H2OPUS     *h2opus = (Mat_H2OPUS*)A->data;
328 #if defined(H2OPUS_USE_MPI)
329   h2opusHandle_t handle = h2opus->handle->handle;
330 #else
331   h2opusHandle_t handle = h2opus->handle;
332 #endif
333   PetscBool      boundtocpu = PETSC_TRUE;
334   PetscScalar    *xx,*yy,*uxx,*uyy;
335   PetscInt       blda,clda;
336   PetscMPIInt    size;
337   PetscSF        bsf,csf;
338   PetscBool      usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);
339 
340   PetscFunctionBegin;
341   HLibProfile::clear();
342 #if defined(PETSC_H2OPUS_USE_GPU)
343   boundtocpu = A->boundtocpu;
344 #endif
345   PetscCall(MatDenseGetLDA(B,&blda));
346   PetscCall(MatDenseGetLDA(C,&clda));
347   if (usesf) {
348     PetscInt n;
349 
350     PetscCall(MatDenseGetH2OpusVectorSF(B,h2opus->sf,&bsf));
351     PetscCall(MatDenseGetH2OpusVectorSF(C,h2opus->sf,&csf));
352 
353     PetscCall(MatH2OpusResizeBuffers_Private(A,B->cmap->N,C->cmap->N));
354     PetscCall(PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL));
355     blda = n;
356     clda = n;
357   }
358   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
359   if (boundtocpu) {
360     PetscCall(MatDenseGetArrayRead(B,(const PetscScalar**)&xx));
361     PetscCall(MatDenseGetArrayWrite(C,&yy));
362     if (usesf) {
363       uxx  = MatH2OpusGetThrustPointer(*h2opus->xx);
364       uyy  = MatH2OpusGetThrustPointer(*h2opus->yy);
365       PetscCall(PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
366       PetscCall(PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
367     } else {
368       uxx = xx;
369       uyy = yy;
370     }
371     if (size > 1) {
372       PetscCheck(h2opus->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
373       PetscCheck(!transA || A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
374 #if defined(H2OPUS_USE_MPI)
375       distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
376 #endif
377     } else {
378       PetscCheck(h2opus->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
379       hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
380     }
381     PetscCall(MatDenseRestoreArrayRead(B,(const PetscScalar**)&xx));
382     if (usesf) {
383       PetscCall(PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
384       PetscCall(PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
385     }
386     PetscCall(MatDenseRestoreArrayWrite(C,&yy));
387 #if defined(PETSC_H2OPUS_USE_GPU)
388   } else {
389     PetscBool ciscuda,biscuda;
390 
391     /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */
392     PetscCall(PetscObjectTypeCompareAny((PetscObject)B,&biscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,""));
393     if (!biscuda) {
394       PetscCall(MatConvert(B,MATDENSECUDA,MAT_INPLACE_MATRIX,&B));
395     }
396     PetscCall(PetscObjectTypeCompareAny((PetscObject)C,&ciscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,""));
397     if (!ciscuda) {
398       C->assembled = PETSC_TRUE;
399       PetscCall(MatConvert(C,MATDENSECUDA,MAT_INPLACE_MATRIX,&C));
400     }
401     PetscCall(MatDenseCUDAGetArrayRead(B,(const PetscScalar**)&xx));
402     PetscCall(MatDenseCUDAGetArrayWrite(C,&yy));
403     if (usesf) {
404       uxx  = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
405       uyy  = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
406       PetscCall(PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
407       PetscCall(PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
408     } else {
409       uxx = xx;
410       uyy = yy;
411     }
412     PetscCall(PetscLogGpuTimeBegin());
413     if (size > 1) {
414       PetscCheck(h2opus->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed GPU matrix");
415       PetscCheck(!transA || A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
416 #if defined(H2OPUS_USE_MPI)
417       distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
418 #endif
419     } else {
420       PetscCheck(h2opus->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
421       hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
422     }
423     PetscCall(PetscLogGpuTimeEnd());
424     PetscCall(MatDenseCUDARestoreArrayRead(B,(const PetscScalar**)&xx));
425     if (usesf) {
426       PetscCall(PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
427       PetscCall(PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
428     }
429     PetscCall(MatDenseCUDARestoreArrayWrite(C,&yy));
430     if (!biscuda) {
431       PetscCall(MatConvert(B,MATDENSE,MAT_INPLACE_MATRIX,&B));
432     }
433     if (!ciscuda) {
434       PetscCall(MatConvert(C,MATDENSE,MAT_INPLACE_MATRIX,&C));
435     }
436 #endif
437   }
438   { /* log flops */
439     double gops,time,perf,dev;
440     HLibProfile::getHgemvPerf(gops,time,perf,dev);
441 #if defined(PETSC_H2OPUS_USE_GPU)
442     if (boundtocpu) {
443       PetscCall(PetscLogFlops(1e9*gops));
444     } else {
445       PetscCall(PetscLogGpuFlops(1e9*gops));
446     }
447 #else
448     PetscCall(PetscLogFlops(1e9*gops));
449 #endif
450   }
451   PetscFunctionReturn(0);
452 }
453 
454 static PetscErrorCode MatProductNumeric_H2OPUS(Mat C)
455 {
456   Mat_Product    *product = C->product;
457 
458   PetscFunctionBegin;
459   MatCheckProduct(C,1);
460   switch (product->type) {
461   case MATPRODUCT_AB:
462     PetscCall(MatMultNKernel_H2OPUS(product->A,PETSC_FALSE,product->B,C));
463     break;
464   case MATPRODUCT_AtB:
465     PetscCall(MatMultNKernel_H2OPUS(product->A,PETSC_TRUE,product->B,C));
466     break;
467   default:
468     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]);
469   }
470   PetscFunctionReturn(0);
471 }
472 
473 static PetscErrorCode MatProductSymbolic_H2OPUS(Mat C)
474 {
475   Mat_Product    *product = C->product;
476   PetscBool      cisdense;
477   Mat            A,B;
478 
479   PetscFunctionBegin;
480   MatCheckProduct(C,1);
481   A = product->A;
482   B = product->B;
483   switch (product->type) {
484   case MATPRODUCT_AB:
485     PetscCall(MatSetSizes(C,A->rmap->n,B->cmap->n,A->rmap->N,B->cmap->N));
486     PetscCall(MatSetBlockSizesFromMats(C,product->A,product->B));
487     PetscCall(PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,""));
488     if (!cisdense) PetscCall(MatSetType(C,((PetscObject)product->B)->type_name));
489     PetscCall(MatSetUp(C));
490     break;
491   case MATPRODUCT_AtB:
492     PetscCall(MatSetSizes(C,A->cmap->n,B->cmap->n,A->cmap->N,B->cmap->N));
493     PetscCall(MatSetBlockSizesFromMats(C,product->A,product->B));
494     PetscCall(PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,""));
495     if (!cisdense) PetscCall(MatSetType(C,((PetscObject)product->B)->type_name));
496     PetscCall(MatSetUp(C));
497     break;
498   default:
499     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]);
500   }
501   C->ops->productsymbolic = NULL;
502   C->ops->productnumeric = MatProductNumeric_H2OPUS;
503   PetscFunctionReturn(0);
504 }
505 
506 static PetscErrorCode MatProductSetFromOptions_H2OPUS(Mat C)
507 {
508   PetscFunctionBegin;
509   MatCheckProduct(C,1);
510   if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) {
511     C->ops->productsymbolic = MatProductSymbolic_H2OPUS;
512   }
513   PetscFunctionReturn(0);
514 }
515 
516 static PetscErrorCode MatMultKernel_H2OPUS(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans)
517 {
518   Mat_H2OPUS     *h2opus = (Mat_H2OPUS*)A->data;
519 #if defined(H2OPUS_USE_MPI)
520   h2opusHandle_t handle = h2opus->handle->handle;
521 #else
522   h2opusHandle_t handle = h2opus->handle;
523 #endif
524   PetscBool      boundtocpu = PETSC_TRUE;
525   PetscInt       n;
526   PetscScalar    *xx,*yy,*uxx,*uyy;
527   PetscMPIInt    size;
528   PetscBool      usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);
529 
530   PetscFunctionBegin;
531   HLibProfile::clear();
532   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
533 #if defined(PETSC_H2OPUS_USE_GPU)
534   boundtocpu = A->boundtocpu;
535 #endif
536   if (usesf) {
537     PetscCall(PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL));
538   } else n = A->rmap->n;
539   if (boundtocpu) {
540     PetscCall(VecGetArrayRead(x,(const PetscScalar**)&xx));
541     if (sy == 0.0) {
542       PetscCall(VecGetArrayWrite(y,&yy));
543     } else {
544       PetscCall(VecGetArray(y,&yy));
545     }
546     if (usesf) {
547       uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
548       uyy = MatH2OpusGetThrustPointer(*h2opus->yy);
549 
550       PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
551       PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
552       if (sy != 0.0) {
553         PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE));
554         PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE));
555       }
556     } else {
557       uxx = xx;
558       uyy = yy;
559     }
560     if (size > 1) {
561       PetscCheck(h2opus->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
562       PetscCheck(!trans || A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
563 #if defined(H2OPUS_USE_MPI)
564       distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix, uxx, n, sy, uyy, n, 1, h2opus->handle);
565 #endif
566     } else {
567       PetscCheck(h2opus->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
568       hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, n, sy, uyy, n, 1, handle);
569     }
570     PetscCall(VecRestoreArrayRead(x,(const PetscScalar**)&xx));
571     if (usesf) {
572       PetscCall(PetscSFReduceBegin(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
573       PetscCall(PetscSFReduceEnd(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
574     }
575     if (sy == 0.0) {
576       PetscCall(VecRestoreArrayWrite(y,&yy));
577     } else {
578       PetscCall(VecRestoreArray(y,&yy));
579     }
580 #if defined(PETSC_H2OPUS_USE_GPU)
581   } else {
582     PetscCall(VecCUDAGetArrayRead(x,(const PetscScalar**)&xx));
583     if (sy == 0.0) {
584       PetscCall(VecCUDAGetArrayWrite(y,&yy));
585     } else {
586       PetscCall(VecCUDAGetArray(y,&yy));
587     }
588     if (usesf) {
589       uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
590       uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
591 
592       PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
593       PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
594       if (sy != 0.0) {
595         PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE));
596         PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE));
597       }
598     } else {
599       uxx = xx;
600       uyy = yy;
601     }
602     PetscCall(PetscLogGpuTimeBegin());
603     if (size > 1) {
604       PetscCheck(h2opus->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed GPU matrix");
605       PetscCheck(!trans || A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
606 #if defined(H2OPUS_USE_MPI)
607       distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, h2opus->handle);
608 #endif
609     } else {
610       PetscCheck(h2opus->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
611       hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle);
612     }
613     PetscCall(PetscLogGpuTimeEnd());
614     PetscCall(VecCUDARestoreArrayRead(x,(const PetscScalar**)&xx));
615     if (usesf) {
616       PetscCall(PetscSFReduceBegin(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
617       PetscCall(PetscSFReduceEnd(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
618     }
619     if (sy == 0.0) {
620       PetscCall(VecCUDARestoreArrayWrite(y,&yy));
621     } else {
622       PetscCall(VecCUDARestoreArray(y,&yy));
623     }
624 #endif
625   }
626   { /* log flops */
627     double gops,time,perf,dev;
628     HLibProfile::getHgemvPerf(gops,time,perf,dev);
629 #if defined(PETSC_H2OPUS_USE_GPU)
630     if (boundtocpu) {
631       PetscCall(PetscLogFlops(1e9*gops));
632     } else {
633       PetscCall(PetscLogGpuFlops(1e9*gops));
634     }
635 #else
636     PetscCall(PetscLogFlops(1e9*gops));
637 #endif
638   }
639   PetscFunctionReturn(0);
640 }
641 
642 static PetscErrorCode MatMultTranspose_H2OPUS(Mat A, Vec x, Vec y)
643 {
644   PetscBool      xiscuda,yiscuda;
645 
646   PetscFunctionBegin;
647   PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,""));
648   PetscCall(PetscObjectTypeCompareAny((PetscObject)y,&yiscuda,VECSEQCUDA,VECMPICUDA,""));
649   PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !yiscuda));
650   PetscCall(MatMultKernel_H2OPUS(A,x,0.0,y,PETSC_TRUE));
651   PetscFunctionReturn(0);
652 }
653 
654 static PetscErrorCode MatMult_H2OPUS(Mat A, Vec x, Vec y)
655 {
656   PetscBool      xiscuda,yiscuda;
657 
658   PetscFunctionBegin;
659   PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,""));
660   PetscCall(PetscObjectTypeCompareAny((PetscObject)y,&yiscuda,VECSEQCUDA,VECMPICUDA,""));
661   PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !yiscuda));
662   PetscCall(MatMultKernel_H2OPUS(A,x,0.0,y,PETSC_FALSE));
663   PetscFunctionReturn(0);
664 }
665 
666 static PetscErrorCode MatMultTransposeAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
667 {
668   PetscBool      xiscuda,ziscuda;
669 
670   PetscFunctionBegin;
671   PetscCall(VecCopy(y,z));
672   PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,""));
673   PetscCall(PetscObjectTypeCompareAny((PetscObject)z,&ziscuda,VECSEQCUDA,VECMPICUDA,""));
674   PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !ziscuda));
675   PetscCall(MatMultKernel_H2OPUS(A,x,1.0,z,PETSC_TRUE));
676   PetscFunctionReturn(0);
677 }
678 
679 static PetscErrorCode MatMultAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
680 {
681   PetscBool      xiscuda,ziscuda;
682 
683   PetscFunctionBegin;
684   PetscCall(VecCopy(y,z));
685   PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,""));
686   PetscCall(PetscObjectTypeCompareAny((PetscObject)z,&ziscuda,VECSEQCUDA,VECMPICUDA,""));
687   PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !ziscuda));
688   PetscCall(MatMultKernel_H2OPUS(A,x,1.0,z,PETSC_FALSE));
689   PetscFunctionReturn(0);
690 }
691 
692 static PetscErrorCode MatScale_H2OPUS(Mat A, PetscScalar s)
693 {
694   Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
695 
696   PetscFunctionBegin;
697   a->s *= s;
698   PetscFunctionReturn(0);
699 }
700 
701 static PetscErrorCode MatSetFromOptions_H2OPUS(PetscOptionItems *PetscOptionsObject,Mat A)
702 {
703   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
704 
705   PetscFunctionBegin;
706   PetscOptionsHeadBegin(PetscOptionsObject,"H2OPUS options");
707   PetscCall(PetscOptionsInt("-mat_h2opus_leafsize","Leaf size of cluster tree",NULL,a->leafsize,&a->leafsize,NULL));
708   PetscCall(PetscOptionsReal("-mat_h2opus_eta","Admissibility condition tolerance",NULL,a->eta,&a->eta,NULL));
709   PetscCall(PetscOptionsInt("-mat_h2opus_order","Basis order for off-diagonal sampling when constructed from kernel",NULL,a->basisord,&a->basisord,NULL));
710   PetscCall(PetscOptionsInt("-mat_h2opus_maxrank","Maximum rank when constructed from matvecs",NULL,a->max_rank,&a->max_rank,NULL));
711   PetscCall(PetscOptionsInt("-mat_h2opus_samples","Maximum number of samples to be taken concurrently when constructing from matvecs",NULL,a->bs,&a->bs,NULL));
712   PetscCall(PetscOptionsInt("-mat_h2opus_normsamples","Maximum bumber of samples to be when estimating norms",NULL,a->norm_max_samples,&a->norm_max_samples,NULL));
713   PetscCall(PetscOptionsReal("-mat_h2opus_rtol","Relative tolerance for construction from sampling",NULL,a->rtol,&a->rtol,NULL));
714   PetscCall(PetscOptionsBool("-mat_h2opus_check","Check error when constructing from sampling during MatAssemblyEnd()",NULL,a->check_construction,&a->check_construction,NULL));
715   PetscCall(PetscOptionsBool("-mat_h2opus_hara_verbose","Verbose output from hara construction",NULL,a->hara_verbose,&a->hara_verbose,NULL));
716   PetscCall(PetscOptionsBool("-mat_h2opus_resize","Resize after compression",NULL,a->resize,&a->resize,NULL));
717   PetscOptionsHeadEnd();
718   PetscFunctionReturn(0);
719 }
720 
721 static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat,PetscInt,const PetscReal[],PetscBool,MatH2OpusKernel,void*);
722 
723 static PetscErrorCode MatH2OpusInferCoordinates_Private(Mat A)
724 {
725   Mat_H2OPUS        *a = (Mat_H2OPUS*)A->data;
726   Vec               c;
727   PetscInt          spacedim;
728   const PetscScalar *coords;
729 
730   PetscFunctionBegin;
731   if (a->ptcloud) PetscFunctionReturn(0);
732   PetscCall(PetscObjectQuery((PetscObject)A,"__math2opus_coords",(PetscObject*)&c));
733   if (!c && a->sampler) {
734     Mat S = a->sampler->GetSamplingMat();
735 
736     PetscCall(PetscObjectQuery((PetscObject)S,"__math2opus_coords",(PetscObject*)&c));
737   }
738   if (!c) {
739     PetscCall(MatH2OpusSetCoords_H2OPUS(A,-1,NULL,PETSC_FALSE,NULL,NULL));
740   } else {
741     PetscCall(VecGetArrayRead(c,&coords));
742     PetscCall(VecGetBlockSize(c,&spacedim));
743     PetscCall(MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,PETSC_FALSE,NULL,NULL));
744     PetscCall(VecRestoreArrayRead(c,&coords));
745   }
746   PetscFunctionReturn(0);
747 }
748 
749 static PetscErrorCode MatSetUpMultiply_H2OPUS(Mat A)
750 {
751   MPI_Comm       comm;
752   PetscMPIInt    size;
753   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
754   PetscInt       n = 0,*idx = NULL;
755   int            *iidx = NULL;
756   PetscCopyMode  own;
757   PetscBool      rid;
758 
759   PetscFunctionBegin;
760   if (a->multsetup) PetscFunctionReturn(0);
761   if (a->sf) { /* MatDuplicate_H2OPUS takes reference to the SF */
762     PetscCall(PetscSFGetGraph(a->sf,NULL,&n,NULL,NULL));
763 #if defined(PETSC_H2OPUS_USE_GPU)
764     a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
765     a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
766     a->xxs_gpu = 1;
767     a->yys_gpu = 1;
768 #endif
769     a->xx  = new thrust::host_vector<PetscScalar>(n);
770     a->yy  = new thrust::host_vector<PetscScalar>(n);
771     a->xxs = 1;
772     a->yys = 1;
773   } else {
774     IS is;
775     PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
776     PetscCallMPI(MPI_Comm_size(comm,&size));
777     if (!a->h2opus_indexmap) {
778       if (size > 1) {
779         PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
780 #if defined(H2OPUS_USE_MPI)
781         iidx = MatH2OpusGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map);
782         n    = a->dist_hmatrix->basis_tree.basis_branch.index_map.size();
783 #endif
784       } else {
785         iidx = MatH2OpusGetThrustPointer(a->hmatrix->u_basis_tree.index_map);
786         n    = a->hmatrix->u_basis_tree.index_map.size();
787       }
788 
789       if (PetscDefined(USE_64BIT_INDICES)) {
790         PetscInt i;
791 
792         own  = PETSC_OWN_POINTER;
793         PetscCall(PetscMalloc1(n,&idx));
794         for (i=0;i<n;i++) idx[i] = iidx[i];
795       } else {
796         own  = PETSC_COPY_VALUES;
797         idx  = (PetscInt*)iidx;
798       }
799       PetscCall(ISCreateGeneral(comm,n,idx,own,&is));
800       PetscCall(ISSetPermutation(is));
801       PetscCall(ISViewFromOptions(is,(PetscObject)A,"-mat_h2opus_indexmap_view"));
802       a->h2opus_indexmap = is;
803     }
804     PetscCall(ISGetLocalSize(a->h2opus_indexmap,&n));
805     PetscCall(ISGetIndices(a->h2opus_indexmap,(const PetscInt **)&idx));
806     rid  = (PetscBool)(n == A->rmap->n);
807     PetscCall(MPIU_Allreduce(MPI_IN_PLACE,&rid,1,MPIU_BOOL,MPI_LAND,comm));
808     if (rid) {
809       PetscCall(ISIdentity(a->h2opus_indexmap,&rid));
810     }
811     if (!rid) {
812       if (size > 1) { /* Parallel distribution may be different, save it here for fast path in MatMult (see MatH2OpusSetNativeMult) */
813         PetscCall(PetscLayoutCreate(comm,&a->h2opus_rmap));
814         PetscCall(PetscLayoutSetLocalSize(a->h2opus_rmap,n));
815         PetscCall(PetscLayoutSetUp(a->h2opus_rmap));
816         PetscCall(PetscLayoutReference(a->h2opus_rmap,&a->h2opus_cmap));
817       }
818       PetscCall(PetscSFCreate(comm,&a->sf));
819       PetscCall(PetscSFSetGraphLayout(a->sf,A->rmap,n,NULL,PETSC_OWN_POINTER,idx));
820       PetscCall(PetscSFSetUp(a->sf));
821       PetscCall(PetscSFViewFromOptions(a->sf,(PetscObject)A,"-mat_h2opus_sf_view"));
822 #if defined(PETSC_H2OPUS_USE_GPU)
823       a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
824       a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
825       a->xxs_gpu = 1;
826       a->yys_gpu = 1;
827 #endif
828       a->xx  = new thrust::host_vector<PetscScalar>(n);
829       a->yy  = new thrust::host_vector<PetscScalar>(n);
830       a->xxs = 1;
831       a->yys = 1;
832     }
833     PetscCall(ISRestoreIndices(a->h2opus_indexmap,(const PetscInt **)&idx));
834   }
835   a->multsetup = PETSC_TRUE;
836   PetscFunctionReturn(0);
837 }
838 
839 static PetscErrorCode MatAssemblyEnd_H2OPUS(Mat A, MatAssemblyType assemblytype)
840 {
841   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
842 #if defined(H2OPUS_USE_MPI)
843   h2opusHandle_t handle = a->handle->handle;
844 #else
845   h2opusHandle_t handle = a->handle;
846 #endif
847   PetscBool      kernel = PETSC_FALSE;
848   PetscBool      boundtocpu = PETSC_TRUE;
849   PetscBool      samplingdone = PETSC_FALSE;
850   MPI_Comm       comm;
851   PetscMPIInt    size;
852 
853   PetscFunctionBegin;
854   PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
855   PetscCheck(A->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported");
856   PetscCheck(A->rmap->N == A->cmap->N,comm,PETSC_ERR_SUP,"Rectangular matrices are not supported");
857 
858   /* XXX */
859   a->leafsize = PetscMin(a->leafsize, PetscMin(A->rmap->N, A->cmap->N));
860 
861   PetscCallMPI(MPI_Comm_size(comm,&size));
862   /* TODO REUSABILITY of geometric construction */
863   delete a->hmatrix;
864   delete a->dist_hmatrix;
865 #if defined(PETSC_H2OPUS_USE_GPU)
866   delete a->hmatrix_gpu;
867   delete a->dist_hmatrix_gpu;
868 #endif
869   a->orthogonal = PETSC_FALSE;
870 
871   /* TODO: other? */
872   H2OpusBoxCenterAdmissibility adm(a->eta);
873 
874   PetscCall(PetscLogEventBegin(MAT_H2Opus_Build,A,0,0,0));
875   if (size > 1) {
876 #if defined(H2OPUS_USE_MPI)
877     a->dist_hmatrix = new DistributedHMatrix(A->rmap->n/* ,A->symmetric */);
878 #else
879     a->dist_hmatrix = NULL;
880 #endif
881   } else {
882     a->hmatrix = new HMatrix(A->rmap->n,A->symmetric);
883   }
884   PetscCall(MatH2OpusInferCoordinates_Private(A));
885   PetscCheck(a->ptcloud,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing pointcloud");
886   if (a->kernel) {
887     BoxEntryGen<PetscScalar, H2OPUS_HWTYPE_CPU, PetscFunctionGenerator<PetscScalar>> entry_gen(*a->kernel);
888     if (size > 1) {
889       PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
890 #if defined(H2OPUS_USE_MPI)
891       buildDistributedHMatrix(*a->dist_hmatrix,a->ptcloud,adm,entry_gen,a->leafsize,a->basisord,a->handle);
892 #endif
893     } else {
894       buildHMatrix(*a->hmatrix,a->ptcloud,adm,entry_gen,a->leafsize,a->basisord);
895     }
896     kernel = PETSC_TRUE;
897   } else {
898     PetscCheck(size <= 1,comm,PETSC_ERR_SUP,"Construction from sampling not supported in parallel");
899     buildHMatrixStructure(*a->hmatrix,a->ptcloud,a->leafsize,adm);
900   }
901   PetscCall(MatSetUpMultiply_H2OPUS(A));
902 
903 #if defined(PETSC_H2OPUS_USE_GPU)
904   boundtocpu = A->boundtocpu;
905   if (!boundtocpu) {
906     if (size > 1) {
907       PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
908 #if defined(H2OPUS_USE_MPI)
909       a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
910 #endif
911     } else {
912       a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
913     }
914   }
915 #endif
916   if (size == 1) {
917     if (!kernel && a->sampler && a->sampler->GetSamplingMat()) {
918       PetscReal Anorm;
919       bool      verbose;
920 
921       PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_hara_verbose",&a->hara_verbose,NULL));
922       verbose = a->hara_verbose;
923       PetscCall(MatApproximateNorm_Private(a->sampler->GetSamplingMat(),NORM_2,a->norm_max_samples,&Anorm));
924       if (a->hara_verbose) PetscCall(PetscPrintf(PETSC_COMM_SELF,"Sampling uses max rank %d, tol %g (%g*%g), %s samples %d\n",a->max_rank,a->rtol*Anorm,a->rtol,Anorm,boundtocpu ? "CPU" : "GPU",a->bs));
925       if (a->sf && !a->nativemult) {
926         a->sampler->SetIndexMap(a->hmatrix->u_basis_tree.index_map.size(),a->hmatrix->u_basis_tree.index_map.data());
927       }
928       a->sampler->SetStream(handle->getMainStream());
929       if (boundtocpu) {
930         a->sampler->SetGPUSampling(false);
931         hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose);
932 #if defined(PETSC_H2OPUS_USE_GPU)
933       } else {
934         a->sampler->SetGPUSampling(true);
935         hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose);
936 #endif
937       }
938       samplingdone = PETSC_TRUE;
939     }
940   }
941 #if defined(PETSC_H2OPUS_USE_GPU)
942   if (!boundtocpu) {
943     delete a->hmatrix;
944     delete a->dist_hmatrix;
945     a->hmatrix = NULL;
946     a->dist_hmatrix = NULL;
947   }
948   A->offloadmask = boundtocpu ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
949 #endif
950   PetscCall(PetscLogEventEnd(MAT_H2Opus_Build,A,0,0,0));
951 
952   if (!a->s) a->s = 1.0;
953   A->assembled = PETSC_TRUE;
954 
955   if (samplingdone) {
956     PetscBool check = a->check_construction;
957     PetscBool checke = PETSC_FALSE;
958 
959     PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_check",&check,NULL));
960     PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_check_explicit",&checke,NULL));
961     if (check) {
962       Mat       E,Ae;
963       PetscReal n1,ni,n2;
964       PetscReal n1A,niA,n2A;
965       void      (*normfunc)(void);
966 
967       Ae   = a->sampler->GetSamplingMat();
968       PetscCall(MatConvert(A,MATSHELL,MAT_INITIAL_MATRIX,&E));
969       PetscCall(MatShellSetOperation(E,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS));
970       PetscCall(MatAXPY(E,-1.0,Ae,DIFFERENT_NONZERO_PATTERN));
971       PetscCall(MatNorm(E,NORM_1,&n1));
972       PetscCall(MatNorm(E,NORM_INFINITY,&ni));
973       PetscCall(MatNorm(E,NORM_2,&n2));
974       if (checke) {
975         Mat eA,eE,eAe;
976 
977         PetscCall(MatComputeOperator(A,MATAIJ,&eA));
978         PetscCall(MatComputeOperator(E,MATAIJ,&eE));
979         PetscCall(MatComputeOperator(Ae,MATAIJ,&eAe));
980         PetscCall(MatChop(eA,PETSC_SMALL));
981         PetscCall(MatChop(eE,PETSC_SMALL));
982         PetscCall(MatChop(eAe,PETSC_SMALL));
983         PetscCall(PetscObjectSetName((PetscObject)eA,"H2Mat"));
984         PetscCall(MatView(eA,NULL));
985         PetscCall(PetscObjectSetName((PetscObject)eAe,"S"));
986         PetscCall(MatView(eAe,NULL));
987         PetscCall(PetscObjectSetName((PetscObject)eE,"H2Mat - S"));
988         PetscCall(MatView(eE,NULL));
989         PetscCall(MatDestroy(&eA));
990         PetscCall(MatDestroy(&eE));
991         PetscCall(MatDestroy(&eAe));
992       }
993 
994       PetscCall(MatGetOperation(Ae,MATOP_NORM,&normfunc));
995       PetscCall(MatSetOperation(Ae,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS));
996       PetscCall(MatNorm(Ae,NORM_1,&n1A));
997       PetscCall(MatNorm(Ae,NORM_INFINITY,&niA));
998       PetscCall(MatNorm(Ae,NORM_2,&n2A));
999       n1A  = PetscMax(n1A,PETSC_SMALL);
1000       n2A  = PetscMax(n2A,PETSC_SMALL);
1001       niA  = PetscMax(niA,PETSC_SMALL);
1002       PetscCall(MatSetOperation(Ae,MATOP_NORM,normfunc));
1003       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A),"MATH2OPUS construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n",(double)n1,(double)ni,(double)n2,(double)(n1/n1A),(double)(ni/niA),(double)(n2/n2A)));
1004       PetscCall(MatDestroy(&E));
1005     }
1006     a->sampler->SetSamplingMat(NULL);
1007   }
1008   PetscFunctionReturn(0);
1009 }
1010 
1011 static PetscErrorCode MatZeroEntries_H2OPUS(Mat A)
1012 {
1013   PetscMPIInt    size;
1014   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1015 
1016   PetscFunctionBegin;
1017   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1018   PetscCheck(size <= 1,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not yet supported");
1019   a->hmatrix->clearData();
1020 #if defined(PETSC_H2OPUS_USE_GPU)
1021   if (a->hmatrix_gpu) a->hmatrix_gpu->clearData();
1022 #endif
1023   PetscFunctionReturn(0);
1024 }
1025 
1026 static PetscErrorCode MatDuplicate_H2OPUS(Mat B, MatDuplicateOption op, Mat *nA)
1027 {
1028   Mat            A;
1029   Mat_H2OPUS     *a, *b = (Mat_H2OPUS*)B->data;
1030 #if defined(PETSC_H2OPUS_USE_GPU)
1031   PetscBool      iscpu = PETSC_FALSE;
1032 #else
1033   PetscBool      iscpu = PETSC_TRUE;
1034 #endif
1035   MPI_Comm       comm;
1036 
1037   PetscFunctionBegin;
1038   PetscCall(PetscObjectGetComm((PetscObject)B,&comm));
1039   PetscCall(MatCreate(comm,&A));
1040   PetscCall(MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N));
1041   PetscCall(MatSetType(A,MATH2OPUS));
1042   PetscCall(MatPropagateSymmetryOptions(B,A));
1043   a = (Mat_H2OPUS*)A->data;
1044 
1045   a->eta              = b->eta;
1046   a->leafsize         = b->leafsize;
1047   a->basisord         = b->basisord;
1048   a->max_rank         = b->max_rank;
1049   a->bs               = b->bs;
1050   a->rtol             = b->rtol;
1051   a->norm_max_samples = b->norm_max_samples;
1052   if (op == MAT_COPY_VALUES) a->s = b->s;
1053 
1054   a->ptcloud = new PetscPointCloud<PetscReal>(*b->ptcloud);
1055   if (op == MAT_COPY_VALUES && b->kernel) a->kernel = new PetscFunctionGenerator<PetscScalar>(*b->kernel);
1056 
1057 #if defined(H2OPUS_USE_MPI)
1058   if (b->dist_hmatrix) { a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix); }
1059 #if defined(PETSC_H2OPUS_USE_GPU)
1060   if (b->dist_hmatrix_gpu) { a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu); }
1061 #endif
1062 #endif
1063   if (b->hmatrix) {
1064     a->hmatrix = new HMatrix(*b->hmatrix);
1065     if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData();
1066   }
1067 #if defined(PETSC_H2OPUS_USE_GPU)
1068   if (b->hmatrix_gpu) {
1069     a->hmatrix_gpu = new HMatrix_GPU(*b->hmatrix_gpu);
1070     if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData();
1071   }
1072 #endif
1073   if (b->sf) {
1074     PetscCall(PetscObjectReference((PetscObject)b->sf));
1075     a->sf = b->sf;
1076   }
1077   if (b->h2opus_indexmap) {
1078     PetscCall(PetscObjectReference((PetscObject)b->h2opus_indexmap));
1079     a->h2opus_indexmap = b->h2opus_indexmap;
1080   }
1081 
1082   PetscCall(MatSetUp(A));
1083   PetscCall(MatSetUpMultiply_H2OPUS(A));
1084   if (op == MAT_COPY_VALUES) {
1085     A->assembled = PETSC_TRUE;
1086     a->orthogonal = b->orthogonal;
1087 #if defined(PETSC_H2OPUS_USE_GPU)
1088     A->offloadmask = B->offloadmask;
1089 #endif
1090   }
1091 #if defined(PETSC_H2OPUS_USE_GPU)
1092   iscpu = B->boundtocpu;
1093 #endif
1094   PetscCall(MatBindToCPU(A,iscpu));
1095 
1096   *nA = A;
1097   PetscFunctionReturn(0);
1098 }
1099 
1100 static PetscErrorCode MatView_H2OPUS(Mat A, PetscViewer view)
1101 {
1102   Mat_H2OPUS        *h2opus = (Mat_H2OPUS*)A->data;
1103   PetscBool         isascii, vieweps;
1104   PetscMPIInt       size;
1105   PetscViewerFormat format;
1106 
1107   PetscFunctionBegin;
1108   PetscCall(PetscObjectTypeCompare((PetscObject)view,PETSCVIEWERASCII,&isascii));
1109   PetscCall(PetscViewerGetFormat(view,&format));
1110   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1111   if (isascii) {
1112     if (format == PETSC_VIEWER_ASCII_MATLAB) {
1113       if (size == 1) {
1114         FILE *fp;
1115         PetscCall(PetscViewerASCIIGetPointer(view,&fp));
1116         dumpHMatrix(*h2opus->hmatrix,6,fp);
1117       }
1118     } else {
1119       PetscCall(PetscViewerASCIIPrintf(view,"  H-Matrix constructed from %s\n",h2opus->kernel ? "Kernel" : "Mat"));
1120       PetscCall(PetscViewerASCIIPrintf(view,"  PointCloud dim %" PetscInt_FMT "\n",h2opus->ptcloud ? h2opus->ptcloud->getDimension() : 0));
1121       PetscCall(PetscViewerASCIIPrintf(view,"  Admissibility parameters: leaf size %" PetscInt_FMT ", eta %g\n",h2opus->leafsize,(double)h2opus->eta));
1122       if (!h2opus->kernel) {
1123         PetscCall(PetscViewerASCIIPrintf(view,"  Sampling parameters: max_rank %" PetscInt_FMT ", samples %" PetscInt_FMT ", tolerance %g\n",h2opus->max_rank,h2opus->bs,(double)h2opus->rtol));
1124       } else {
1125         PetscCall(PetscViewerASCIIPrintf(view,"  Offdiagonal blocks approximation order %" PetscInt_FMT "\n",h2opus->basisord));
1126       }
1127       PetscCall(PetscViewerASCIIPrintf(view,"  Number of samples for norms %" PetscInt_FMT "\n",h2opus->norm_max_samples));
1128       if (size == 1) {
1129         double dense_mem_cpu = h2opus->hmatrix ? h2opus->hmatrix->getDenseMemoryUsage() : 0;
1130         double low_rank_cpu = h2opus->hmatrix ? h2opus->hmatrix->getLowRankMemoryUsage() : 0;
1131 #if defined(PETSC_HAVE_CUDA)
1132         double dense_mem_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getDenseMemoryUsage() : 0;
1133         double low_rank_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getLowRankMemoryUsage() : 0;
1134 #endif
1135         PetscCall(PetscViewerASCIIPrintf(view,"  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu));
1136 #if defined(PETSC_HAVE_CUDA)
1137         PetscCall(PetscViewerASCIIPrintf(view,"  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu));
1138 #endif
1139       } else {
1140 #if defined(PETSC_HAVE_CUDA)
1141         double      matrix_mem[4] = {0.,0.,0.,0.};
1142         PetscMPIInt rsize = 4;
1143 #else
1144         double      matrix_mem[2] = {0.,0.};
1145         PetscMPIInt rsize = 2;
1146 #endif
1147 #if defined(H2OPUS_USE_MPI)
1148         matrix_mem[0] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalDenseMemoryUsage() : 0;
1149         matrix_mem[1] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalLowRankMemoryUsage() : 0;
1150 #if defined(PETSC_HAVE_CUDA)
1151         matrix_mem[2] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalDenseMemoryUsage() : 0;
1152         matrix_mem[3] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalLowRankMemoryUsage() : 0;
1153 #endif
1154 #endif
1155         PetscCall(MPIU_Allreduce(MPI_IN_PLACE,matrix_mem,rsize,MPI_DOUBLE_PRECISION,MPI_SUM,PetscObjectComm((PetscObject)A)));
1156         PetscCall(PetscViewerASCIIPrintf(view,"  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[0], matrix_mem[1], matrix_mem[0] + matrix_mem[1]));
1157 #if defined(PETSC_HAVE_CUDA)
1158         PetscCall(PetscViewerASCIIPrintf(view,"  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[2], matrix_mem[3], matrix_mem[2] + matrix_mem[3]));
1159 #endif
1160       }
1161     }
1162   }
1163   vieweps = PETSC_FALSE;
1164   PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_vieweps",&vieweps,NULL));
1165   if (vieweps) {
1166     char filename[256];
1167     const char *name;
1168 
1169     PetscCall(PetscObjectGetName((PetscObject)A,&name));
1170     PetscCall(PetscSNPrintf(filename,sizeof(filename),"%s_structure.eps",name));
1171     PetscCall(PetscOptionsGetString(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_vieweps_filename",filename,sizeof(filename),NULL));
1172     outputEps(*h2opus->hmatrix,filename);
1173   }
1174   PetscFunctionReturn(0);
1175 }
1176 
1177 static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat A, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx)
1178 {
1179   Mat_H2OPUS     *h2opus = (Mat_H2OPUS*)A->data;
1180   PetscReal      *gcoords;
1181   PetscInt       N;
1182   MPI_Comm       comm;
1183   PetscMPIInt    size;
1184   PetscBool      cong;
1185 
1186   PetscFunctionBegin;
1187   PetscCall(PetscLayoutSetUp(A->rmap));
1188   PetscCall(PetscLayoutSetUp(A->cmap));
1189   PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
1190   PetscCall(MatHasCongruentLayouts(A,&cong));
1191   PetscCheck(cong,comm,PETSC_ERR_SUP,"Only for square matrices with congruent layouts");
1192   N = A->rmap->N;
1193   PetscCallMPI(MPI_Comm_size(comm,&size));
1194   if (spacedim > 0 && size > 1 && cdist) {
1195     PetscSF      sf;
1196     MPI_Datatype dtype;
1197 
1198     PetscCallMPI(MPI_Type_contiguous(spacedim,MPIU_REAL,&dtype));
1199     PetscCallMPI(MPI_Type_commit(&dtype));
1200 
1201     PetscCall(PetscSFCreate(comm,&sf));
1202     PetscCall(PetscSFSetGraphWithPattern(sf,A->rmap,PETSCSF_PATTERN_ALLGATHER));
1203     PetscCall(PetscMalloc1(spacedim*N,&gcoords));
1204     PetscCall(PetscSFBcastBegin(sf,dtype,coords,gcoords,MPI_REPLACE));
1205     PetscCall(PetscSFBcastEnd(sf,dtype,coords,gcoords,MPI_REPLACE));
1206     PetscCall(PetscSFDestroy(&sf));
1207     PetscCallMPI(MPI_Type_free(&dtype));
1208   } else gcoords = (PetscReal*)coords;
1209 
1210   delete h2opus->ptcloud;
1211   delete h2opus->kernel;
1212   h2opus->ptcloud = new PetscPointCloud<PetscReal>(spacedim,N,gcoords);
1213   if (kernel) h2opus->kernel = new PetscFunctionGenerator<PetscScalar>(kernel,spacedim,kernelctx);
1214   if (gcoords != coords) PetscCall(PetscFree(gcoords));
1215   A->preallocated = PETSC_TRUE;
1216   PetscFunctionReturn(0);
1217 }
1218 
1219 #if defined(PETSC_H2OPUS_USE_GPU)
1220 static PetscErrorCode MatBindToCPU_H2OPUS(Mat A, PetscBool flg)
1221 {
1222   PetscMPIInt    size;
1223   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1224 
1225   PetscFunctionBegin;
1226   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1227   if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) {
1228     if (size > 1) {
1229       PetscCheck(a->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1230 #if defined(H2OPUS_USE_MPI)
1231       if (!a->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix_gpu);
1232       else *a->dist_hmatrix = *a->dist_hmatrix_gpu;
1233 #endif
1234     } else {
1235       PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1236       if (!a->hmatrix) a->hmatrix = new HMatrix(*a->hmatrix_gpu);
1237       else *a->hmatrix = *a->hmatrix_gpu;
1238     }
1239     delete a->hmatrix_gpu;
1240     delete a->dist_hmatrix_gpu;
1241     a->hmatrix_gpu = NULL;
1242     a->dist_hmatrix_gpu = NULL;
1243     A->offloadmask = PETSC_OFFLOAD_CPU;
1244   } else if (!flg && A->offloadmask == PETSC_OFFLOAD_CPU) {
1245     if (size > 1) {
1246       PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1247 #if defined(H2OPUS_USE_MPI)
1248       if (!a->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
1249       else *a->dist_hmatrix_gpu = *a->dist_hmatrix;
1250 #endif
1251     } else {
1252       PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1253       if (!a->hmatrix_gpu) a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
1254       else *a->hmatrix_gpu = *a->hmatrix;
1255     }
1256     delete a->hmatrix;
1257     delete a->dist_hmatrix;
1258     a->hmatrix = NULL;
1259     a->dist_hmatrix = NULL;
1260     A->offloadmask = PETSC_OFFLOAD_GPU;
1261   }
1262   PetscCall(PetscFree(A->defaultvectype));
1263   if (!flg) {
1264     PetscCall(PetscStrallocpy(VECCUDA,&A->defaultvectype));
1265   } else {
1266     PetscCall(PetscStrallocpy(VECSTANDARD,&A->defaultvectype));
1267   }
1268   A->boundtocpu = flg;
1269   PetscFunctionReturn(0);
1270 }
1271 #endif
1272 
1273 /*MC
1274      MATH2OPUS = "h2opus" - A matrix type for hierarchical matrices using the H2Opus package.
1275 
1276    Options Database Keys:
1277 .     -mat_type h2opus - matrix type to "h2opus" during a call to MatSetFromOptions()
1278 
1279    Notes:
1280      H2Opus implements hierarchical matrices in the H^2 flavour.
1281      It supports CPU or NVIDIA GPUs.
1282      For CPU only builds, use ./configure --download-h2opus --download-thrust to install PETSc to use H2Opus.
1283      In order to run on NVIDIA GPUs, use ./configure --download-h2opus --download-magma --download-kblas.
1284      For details and additional references, see
1285        "H2Opus: A distributed-memory multi-GPU software package for non-local operators",
1286      available at https://arxiv.org/abs/2109.05451.
1287 
1288    Level: beginner
1289 
1290 .seealso: `MATHTOOL`, `MATDENSE`, `MatCreateH2OpusFromKernel()`, `MatCreateH2OpusFromMat()`
1291 M*/
1292 PETSC_EXTERN PetscErrorCode MatCreate_H2OPUS(Mat A)
1293 {
1294   Mat_H2OPUS     *a;
1295   PetscMPIInt    size;
1296 
1297   PetscFunctionBegin;
1298 #if defined(PETSC_H2OPUS_USE_GPU)
1299   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
1300 #endif
1301   PetscCall(PetscNewLog(A,&a));
1302   A->data = (void*)a;
1303 
1304   a->eta              = 0.9;
1305   a->leafsize         = 32;
1306   a->basisord         = 4;
1307   a->max_rank         = 64;
1308   a->bs               = 32;
1309   a->rtol             = 1.e-4;
1310   a->s                = 1.0;
1311   a->norm_max_samples = 10;
1312   a->resize           = PETSC_TRUE; /* reallocate after compression */
1313 #if defined(H2OPUS_USE_MPI)
1314   h2opusCreateDistributedHandleComm(&a->handle,PetscObjectComm((PetscObject)A));
1315 #else
1316   h2opusCreateHandle(&a->handle);
1317 #endif
1318   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1319   PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATH2OPUS));
1320   PetscCall(PetscMemzero(A->ops,sizeof(struct _MatOps)));
1321 
1322   A->ops->destroy          = MatDestroy_H2OPUS;
1323   A->ops->view             = MatView_H2OPUS;
1324   A->ops->assemblyend      = MatAssemblyEnd_H2OPUS;
1325   A->ops->mult             = MatMult_H2OPUS;
1326   A->ops->multtranspose    = MatMultTranspose_H2OPUS;
1327   A->ops->multadd          = MatMultAdd_H2OPUS;
1328   A->ops->multtransposeadd = MatMultTransposeAdd_H2OPUS;
1329   A->ops->scale            = MatScale_H2OPUS;
1330   A->ops->duplicate        = MatDuplicate_H2OPUS;
1331   A->ops->setfromoptions   = MatSetFromOptions_H2OPUS;
1332   A->ops->norm             = MatNorm_H2OPUS;
1333   A->ops->zeroentries      = MatZeroEntries_H2OPUS;
1334 #if defined(PETSC_H2OPUS_USE_GPU)
1335   A->ops->bindtocpu        = MatBindToCPU_H2OPUS;
1336 #endif
1337 
1338   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdense_C",MatProductSetFromOptions_H2OPUS));
1339   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdensecuda_C",MatProductSetFromOptions_H2OPUS));
1340   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidense_C",MatProductSetFromOptions_H2OPUS));
1341   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidensecuda_C",MatProductSetFromOptions_H2OPUS));
1342 #if defined(PETSC_H2OPUS_USE_GPU)
1343   PetscCall(PetscFree(A->defaultvectype));
1344   PetscCall(PetscStrallocpy(VECCUDA,&A->defaultvectype));
1345 #endif
1346   PetscFunctionReturn(0);
1347 }
1348 
1349 /*@C
1350      MatH2OpusOrthogonalize - Orthogonalize the basis tree of a hierarchical matrix.
1351 
1352    Input Parameter:
1353 .     A - the matrix
1354 
1355    Level: intermediate
1356 
1357 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`
1358 @*/
1359 PetscErrorCode MatH2OpusOrthogonalize(Mat A)
1360 {
1361   PetscBool      ish2opus;
1362   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1363   PetscMPIInt    size;
1364   PetscBool      boundtocpu = PETSC_TRUE;
1365 
1366   PetscFunctionBegin;
1367   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
1368   PetscValidType(A,1);
1369   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
1370   if (!ish2opus) PetscFunctionReturn(0);
1371   if (a->orthogonal) PetscFunctionReturn(0);
1372   HLibProfile::clear();
1373   PetscCall(PetscLogEventBegin(MAT_H2Opus_Orthog,A,0,0,0));
1374 #if defined(PETSC_H2OPUS_USE_GPU)
1375   boundtocpu = A->boundtocpu;
1376 #endif
1377   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1378   if (size > 1) {
1379     if (boundtocpu) {
1380       PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1381 #if defined(H2OPUS_USE_MPI)
1382       distributed_horthog(*a->dist_hmatrix, a->handle);
1383 #endif
1384 #if defined(PETSC_H2OPUS_USE_GPU)
1385       A->offloadmask = PETSC_OFFLOAD_CPU;
1386     } else {
1387       PetscCheck(a->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1388       PetscCall(PetscLogGpuTimeBegin());
1389 #if defined(H2OPUS_USE_MPI)
1390       distributed_horthog(*a->dist_hmatrix_gpu, a->handle);
1391 #endif
1392       PetscCall(PetscLogGpuTimeEnd());
1393 #endif
1394     }
1395   } else {
1396 #if defined(H2OPUS_USE_MPI)
1397     h2opusHandle_t handle = a->handle->handle;
1398 #else
1399     h2opusHandle_t handle = a->handle;
1400 #endif
1401     if (boundtocpu) {
1402       PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1403       horthog(*a->hmatrix, handle);
1404 #if defined(PETSC_H2OPUS_USE_GPU)
1405       A->offloadmask = PETSC_OFFLOAD_CPU;
1406     } else {
1407       PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1408       PetscCall(PetscLogGpuTimeBegin());
1409       horthog(*a->hmatrix_gpu, handle);
1410       PetscCall(PetscLogGpuTimeEnd());
1411 #endif
1412     }
1413   }
1414   a->orthogonal = PETSC_TRUE;
1415   { /* log flops */
1416     double gops,time,perf,dev;
1417     HLibProfile::getHorthogPerf(gops,time,perf,dev);
1418 #if defined(PETSC_H2OPUS_USE_GPU)
1419     if (boundtocpu) {
1420       PetscCall(PetscLogFlops(1e9*gops));
1421     } else {
1422       PetscCall(PetscLogGpuFlops(1e9*gops));
1423     }
1424 #else
1425     PetscCall(PetscLogFlops(1e9*gops));
1426 #endif
1427   }
1428   PetscCall(PetscLogEventEnd(MAT_H2Opus_Orthog,A,0,0,0));
1429   PetscFunctionReturn(0);
1430 }
1431 
1432 /*@C
1433      MatH2OpusCompress - Compress a hierarchical matrix.
1434 
1435    Input Parameters:
1436 +     A - the matrix
1437 -     tol - the absolute truncation threshold
1438 
1439    Level: intermediate
1440 
1441 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusOrthogonalize()`
1442 @*/
1443 PetscErrorCode MatH2OpusCompress(Mat A, PetscReal tol)
1444 {
1445   PetscBool      ish2opus;
1446   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1447   PetscMPIInt    size;
1448   PetscBool      boundtocpu = PETSC_TRUE;
1449 
1450   PetscFunctionBegin;
1451   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
1452   PetscValidType(A,1);
1453   PetscValidLogicalCollectiveReal(A,tol,2);
1454   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
1455   if (!ish2opus || tol <= 0.0) PetscFunctionReturn(0);
1456   PetscCall(MatH2OpusOrthogonalize(A));
1457   HLibProfile::clear();
1458   PetscCall(PetscLogEventBegin(MAT_H2Opus_Compress,A,0,0,0));
1459 #if defined(PETSC_H2OPUS_USE_GPU)
1460   boundtocpu = A->boundtocpu;
1461 #endif
1462   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1463   if (size > 1) {
1464     if (boundtocpu) {
1465       PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1466 #if defined(H2OPUS_USE_MPI)
1467       distributed_hcompress(*a->dist_hmatrix, tol, a->handle);
1468       if (a->resize) {
1469         DistributedHMatrix *dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix);
1470         delete a->dist_hmatrix;
1471         a->dist_hmatrix = dist_hmatrix;
1472       }
1473 #endif
1474 #if defined(PETSC_H2OPUS_USE_GPU)
1475       A->offloadmask = PETSC_OFFLOAD_CPU;
1476     } else {
1477       PetscCheck(a->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1478       PetscCall(PetscLogGpuTimeBegin());
1479 #if defined(H2OPUS_USE_MPI)
1480       distributed_hcompress(*a->dist_hmatrix_gpu, tol, a->handle);
1481 
1482       if (a->resize) {
1483         DistributedHMatrix_GPU *dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix_gpu);
1484         delete a->dist_hmatrix_gpu;
1485         a->dist_hmatrix_gpu = dist_hmatrix_gpu;
1486       }
1487 #endif
1488       PetscCall(PetscLogGpuTimeEnd());
1489 #endif
1490     }
1491   } else {
1492 #if defined(H2OPUS_USE_MPI)
1493     h2opusHandle_t handle = a->handle->handle;
1494 #else
1495     h2opusHandle_t handle = a->handle;
1496 #endif
1497     if (boundtocpu) {
1498       PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1499       hcompress(*a->hmatrix, tol, handle);
1500 
1501       if (a->resize) {
1502         HMatrix *hmatrix = new HMatrix(*a->hmatrix);
1503         delete a->hmatrix;
1504         a->hmatrix = hmatrix;
1505       }
1506 #if defined(PETSC_H2OPUS_USE_GPU)
1507       A->offloadmask = PETSC_OFFLOAD_CPU;
1508     } else {
1509       PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1510       PetscCall(PetscLogGpuTimeBegin());
1511       hcompress(*a->hmatrix_gpu, tol, handle);
1512       PetscCall(PetscLogGpuTimeEnd());
1513 
1514       if (a->resize) {
1515         HMatrix_GPU *hmatrix_gpu = new HMatrix_GPU(*a->hmatrix_gpu);
1516         delete a->hmatrix_gpu;
1517         a->hmatrix_gpu = hmatrix_gpu;
1518       }
1519 #endif
1520     }
1521   }
1522   { /* log flops */
1523     double gops,time,perf,dev;
1524     HLibProfile::getHcompressPerf(gops,time,perf,dev);
1525 #if defined(PETSC_H2OPUS_USE_GPU)
1526     if (boundtocpu) {
1527       PetscCall(PetscLogFlops(1e9*gops));
1528     } else {
1529       PetscCall(PetscLogGpuFlops(1e9*gops));
1530     }
1531 #else
1532     PetscCall(PetscLogFlops(1e9*gops));
1533 #endif
1534   }
1535   PetscCall(PetscLogEventEnd(MAT_H2Opus_Compress,A,0,0,0));
1536   PetscFunctionReturn(0);
1537 }
1538 
1539 /*@C
1540      MatH2OpusSetSamplingMat - Set a matrix to be sampled from matrix vector product to construct a hierarchical matrix.
1541 
1542    Input Parameters:
1543 +     A - the hierarchical matrix
1544 .     B - the matrix to be sampled
1545 .     bs - maximum number of samples to be taken concurrently
1546 -     tol - relative tolerance for construction
1547 
1548    Notes: Need to call MatAssemblyBegin/End() to update the hierarchical matrix.
1549 
1550    Level: intermediate
1551 
1552 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`
1553 @*/
1554 PetscErrorCode MatH2OpusSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol)
1555 {
1556   PetscBool      ish2opus;
1557 
1558   PetscFunctionBegin;
1559   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
1560   PetscValidType(A,1);
1561   if (B) PetscValidHeaderSpecific(B,MAT_CLASSID,2);
1562   PetscValidLogicalCollectiveInt(A,bs,3);
1563   PetscValidLogicalCollectiveReal(A,tol,3);
1564   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
1565   if (ish2opus) {
1566     Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
1567 
1568     if (!a->sampler) a->sampler = new PetscMatrixSampler();
1569     a->sampler->SetSamplingMat(B);
1570     if (bs > 0) a->bs = bs;
1571     if (tol > 0.) a->rtol = tol;
1572     delete a->kernel;
1573   }
1574   PetscFunctionReturn(0);
1575 }
1576 
1577 /*@C
1578      MatCreateH2OpusFromKernel - Creates a MATH2OPUS from a user-supplied kernel.
1579 
1580    Input Parameters:
1581 +     comm - MPI communicator
1582 .     m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1583 .     n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
1584 .     M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
1585 .     N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
1586 .     spacedim - dimension of the space coordinates
1587 .     coords - coordinates of the points
1588 .     cdist - whether or not coordinates are distributed
1589 .     kernel - computational kernel (or NULL)
1590 .     kernelctx - kernel context
1591 .     eta - admissibility condition tolerance
1592 .     leafsize - leaf size in cluster tree
1593 -     basisord - approximation order for Chebychev interpolation of low-rank blocks
1594 
1595    Output Parameter:
1596 .     nA - matrix
1597 
1598    Options Database Keys:
1599 +     -mat_h2opus_leafsize <PetscInt> - Leaf size of cluster tree
1600 .     -mat_h2opus_eta <PetscReal> - Admissibility condition tolerance
1601 .     -mat_h2opus_order <PetscInt> - Chebychev approximation order
1602 -     -mat_h2opus_normsamples <PetscInt> - Maximum number of samples to be used when estimating norms
1603 
1604    Level: intermediate
1605 
1606 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`
1607 @*/
1608 PetscErrorCode MatCreateH2OpusFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat* nA)
1609 {
1610   Mat            A;
1611   Mat_H2OPUS     *h2opus;
1612 #if defined(PETSC_H2OPUS_USE_GPU)
1613   PetscBool      iscpu = PETSC_FALSE;
1614 #else
1615   PetscBool      iscpu = PETSC_TRUE;
1616 #endif
1617 
1618   PetscFunctionBegin;
1619   PetscCheck(m == n,PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported");
1620   PetscCall(MatCreate(comm,&A));
1621   PetscCall(MatSetSizes(A,m,n,M,N));
1622   PetscCheck(M == N,comm,PETSC_ERR_SUP,"Rectangular matrices are not supported");
1623   PetscCall(MatSetType(A,MATH2OPUS));
1624   PetscCall(MatBindToCPU(A,iscpu));
1625   PetscCall(MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,cdist,kernel,kernelctx));
1626 
1627   h2opus = (Mat_H2OPUS*)A->data;
1628   if (eta > 0.) h2opus->eta = eta;
1629   if (leafsize > 0) h2opus->leafsize = leafsize;
1630   if (basisord > 0) h2opus->basisord = basisord;
1631 
1632   *nA = A;
1633   PetscFunctionReturn(0);
1634 }
1635 
1636 /*@C
1637      MatCreateH2OpusFromMat - Creates a MATH2OPUS sampling from a user-supplied operator.
1638 
1639    Input Parameters:
1640 +     B - the matrix to be sampled
1641 .     spacedim - dimension of the space coordinates
1642 .     coords - coordinates of the points
1643 .     cdist - whether or not coordinates are distributed
1644 .     eta - admissibility condition tolerance
1645 .     leafsize - leaf size in cluster tree
1646 .     maxrank - maximum rank allowed
1647 .     bs - maximum number of samples to be taken concurrently
1648 -     rtol - relative tolerance for construction
1649 
1650    Output Parameter:
1651 .     nA - matrix
1652 
1653    Options Database Keys:
1654 +     -mat_h2opus_leafsize <PetscInt> - Leaf size of cluster tree
1655 .     -mat_h2opus_eta <PetscReal> - Admissibility condition tolerance
1656 .     -mat_h2opus_maxrank <PetscInt> - Maximum rank when constructed from matvecs
1657 .     -mat_h2opus_samples <PetscInt> - Maximum number of samples to be taken concurrently when constructing from matvecs
1658 .     -mat_h2opus_rtol <PetscReal> - Relative tolerance for construction from sampling
1659 .     -mat_h2opus_check <PetscBool> - Check error when constructing from sampling during MatAssemblyEnd()
1660 .     -mat_h2opus_hara_verbose <PetscBool> - Verbose output from hara construction
1661 -     -mat_h2opus_normsamples <PetscInt> - Maximum bumber of samples to be when estimating norms
1662 
1663    Notes: not available in parallel
1664 
1665    Level: intermediate
1666 
1667 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromKernel()`
1668 @*/
1669 PetscErrorCode MatCreateH2OpusFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA)
1670 {
1671   Mat            A;
1672   Mat_H2OPUS     *h2opus;
1673   MPI_Comm       comm;
1674   PetscBool      boundtocpu = PETSC_TRUE;
1675 
1676   PetscFunctionBegin;
1677   PetscValidHeaderSpecific(B,MAT_CLASSID,1);
1678   PetscValidLogicalCollectiveInt(B,spacedim,2);
1679   PetscValidLogicalCollectiveBool(B,cdist,4);
1680   PetscValidLogicalCollectiveReal(B,eta,5);
1681   PetscValidLogicalCollectiveInt(B,leafsize,6);
1682   PetscValidLogicalCollectiveInt(B,maxrank,7);
1683   PetscValidLogicalCollectiveInt(B,bs,8);
1684   PetscValidLogicalCollectiveReal(B,rtol,9);
1685   PetscValidPointer(nA,10);
1686   PetscCall(PetscObjectGetComm((PetscObject)B,&comm));
1687   PetscCheck(B->rmap->n == B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported");
1688   PetscCheck(B->rmap->N == B->cmap->N,comm,PETSC_ERR_SUP,"Rectangular matrices are not supported");
1689   PetscCall(MatCreate(comm,&A));
1690   PetscCall(MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N));
1691 #if defined(PETSC_H2OPUS_USE_GPU)
1692   {
1693     PetscBool iscuda;
1694     VecType   vtype;
1695 
1696     PetscCall(MatGetVecType(B,&vtype));
1697     PetscCall(PetscStrcmp(vtype,VECCUDA,&iscuda));
1698     if (!iscuda) {
1699       PetscCall(PetscStrcmp(vtype,VECSEQCUDA,&iscuda));
1700       if (!iscuda) {
1701         PetscCall(PetscStrcmp(vtype,VECMPICUDA,&iscuda));
1702       }
1703     }
1704     if (iscuda && !B->boundtocpu) boundtocpu = PETSC_FALSE;
1705   }
1706 #endif
1707   PetscCall(MatSetType(A,MATH2OPUS));
1708   PetscCall(MatBindToCPU(A,boundtocpu));
1709   if (spacedim) {
1710     PetscCall(MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,cdist,NULL,NULL));
1711   }
1712   PetscCall(MatPropagateSymmetryOptions(B,A));
1713   /* PetscCheck(A->symmetric,comm,PETSC_ERR_SUP,"Unsymmetric sampling does not work"); */
1714 
1715   h2opus = (Mat_H2OPUS*)A->data;
1716   h2opus->sampler = new PetscMatrixSampler(B);
1717   if (eta > 0.) h2opus->eta = eta;
1718   if (leafsize > 0) h2opus->leafsize = leafsize;
1719   if (maxrank > 0) h2opus->max_rank = maxrank;
1720   if (bs > 0) h2opus->bs = bs;
1721   if (rtol > 0.) h2opus->rtol = rtol;
1722   *nA = A;
1723   A->preallocated = PETSC_TRUE;
1724   PetscFunctionReturn(0);
1725 }
1726 
1727 /*@C
1728      MatH2OpusGetIndexMap - Access reordering index set.
1729 
1730    Input Parameters:
1731 .     A - the matrix
1732 
1733    Output Parameter:
1734 .     indexmap - the index set for the reordering
1735 
1736    Level: intermediate
1737 
1738 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1739 @*/
1740 PetscErrorCode MatH2OpusGetIndexMap(Mat A, IS *indexmap)
1741 {
1742   PetscBool      ish2opus;
1743   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1744 
1745   PetscFunctionBegin;
1746   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
1747   PetscValidType(A,1);
1748   PetscValidPointer(indexmap,2);
1749   PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1750   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
1751   PetscCheck(ish2opus,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name);
1752   *indexmap = a->h2opus_indexmap;
1753   PetscFunctionReturn(0);
1754 }
1755 
1756 /*@C
1757      MatH2OpusMapVec - Maps a vector between PETSc and H2Opus ordering
1758 
1759    Input Parameters:
1760 +     A - the matrix
1761 .     nativetopetsc - if true, maps from H2Opus ordering to PETSc ordering. If false, applies the reverse map
1762 -     in - the vector to be mapped
1763 
1764    Output Parameter:
1765 .     out - the newly created mapped vector
1766 
1767    Level: intermediate
1768 
1769 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1770 @*/
1771 PetscErrorCode MatH2OpusMapVec(Mat A, PetscBool nativetopetsc, Vec in, Vec* out)
1772 {
1773   PetscBool      ish2opus;
1774   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1775   PetscScalar    *xin,*xout;
1776   PetscBool      nm;
1777 
1778   PetscFunctionBegin;
1779   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
1780   PetscValidType(A,1);
1781   PetscValidLogicalCollectiveBool(A,nativetopetsc,2);
1782   PetscValidHeaderSpecific(in,VEC_CLASSID,3);
1783   PetscValidPointer(out,4);
1784   PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1785   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
1786   PetscCheck(ish2opus,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name);
1787   nm   = a->nativemult;
1788   PetscCall(MatH2OpusSetNativeMult(A,(PetscBool)!nativetopetsc));
1789   PetscCall(MatCreateVecs(A,out,NULL));
1790   PetscCall(MatH2OpusSetNativeMult(A,nm));
1791   if (!a->sf) { /* same ordering */
1792     PetscCall(VecCopy(in,*out));
1793     PetscFunctionReturn(0);
1794   }
1795   PetscCall(VecGetArrayRead(in,(const PetscScalar**)&xin));
1796   PetscCall(VecGetArrayWrite(*out,&xout));
1797   if (nativetopetsc) {
1798     PetscCall(PetscSFReduceBegin(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE));
1799     PetscCall(PetscSFReduceEnd(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE));
1800   } else {
1801     PetscCall(PetscSFBcastBegin(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE));
1802     PetscCall(PetscSFBcastEnd(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE));
1803   }
1804   PetscCall(VecRestoreArrayRead(in,(const PetscScalar**)&xin));
1805   PetscCall(VecRestoreArrayWrite(*out,&xout));
1806   PetscFunctionReturn(0);
1807 }
1808 
1809 /*@C
1810      MatH2OpusLowRankUpdate - Perform a low-rank update of the form A = A + s * U * V^T
1811 
1812    Input Parameters:
1813 +     A - the hierarchical matrix
1814 .     s - the scaling factor
1815 .     U - the dense low-rank update matrix
1816 -     V - (optional) the dense low-rank update matrix (if NULL, then V = U is assumed)
1817 
1818    Notes: The U and V matrices must be in dense format
1819 
1820    Level: intermediate
1821 
1822 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`, `MATDENSE`
1823 @*/
1824 PetscErrorCode MatH2OpusLowRankUpdate(Mat A, Mat U, Mat V, PetscScalar s)
1825 {
1826   PetscBool      flg;
1827 
1828   PetscFunctionBegin;
1829   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
1830   PetscValidType(A,1);
1831   PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1832   PetscValidHeaderSpecific(U,MAT_CLASSID,2);
1833   PetscCheckSameComm(A,1,U,2);
1834   if (V) {
1835     PetscValidHeaderSpecific(V,MAT_CLASSID,3);
1836     PetscCheckSameComm(A,1,V,3);
1837   }
1838   PetscValidLogicalCollectiveScalar(A,s,4);
1839 
1840   if (!V) V = U;
1841   PetscCheck(U->cmap->N == V->cmap->N,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Non matching rank update %" PetscInt_FMT " != %" PetscInt_FMT,U->cmap->N,V->cmap->N);
1842   if (!U->cmap->N) PetscFunctionReturn(0);
1843   PetscCall(PetscLayoutCompare(U->rmap,A->rmap,&flg));
1844   PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"A and U must have the same row layout");
1845   PetscCall(PetscLayoutCompare(V->rmap,A->cmap,&flg));
1846   PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"A column layout must match V row column layout");
1847   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&flg));
1848   if (flg) {
1849     Mat_H2OPUS        *a = (Mat_H2OPUS*)A->data;
1850     const PetscScalar *u,*v,*uu,*vv;
1851     PetscInt          ldu,ldv;
1852     PetscMPIInt       size;
1853 #if defined(H2OPUS_USE_MPI)
1854     h2opusHandle_t    handle = a->handle->handle;
1855 #else
1856     h2opusHandle_t    handle = a->handle;
1857 #endif
1858     PetscBool         usesf = (PetscBool)(a->sf && !a->nativemult);
1859     PetscSF           usf,vsf;
1860 
1861     PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1862     PetscCheck(size <= 1,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not yet implemented in parallel");
1863     PetscCall(PetscLogEventBegin(MAT_H2Opus_LR,A,0,0,0));
1864     PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)U,&flg,MATSEQDENSE,MATMPIDENSE,""));
1865     PetscCheck(flg,PetscObjectComm((PetscObject)U),PETSC_ERR_SUP,"Not for U of type %s",((PetscObject)U)->type_name);
1866     PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)V,&flg,MATSEQDENSE,MATMPIDENSE,""));
1867     PetscCheck(flg,PetscObjectComm((PetscObject)V),PETSC_ERR_SUP,"Not for V of type %s",((PetscObject)V)->type_name);
1868     PetscCall(MatDenseGetLDA(U,&ldu));
1869     PetscCall(MatDenseGetLDA(V,&ldv));
1870     PetscCall(MatBoundToCPU(A,&flg));
1871     if (usesf) {
1872       PetscInt n;
1873 
1874       PetscCall(MatDenseGetH2OpusVectorSF(U,a->sf,&usf));
1875       PetscCall(MatDenseGetH2OpusVectorSF(V,a->sf,&vsf));
1876       PetscCall(MatH2OpusResizeBuffers_Private(A,U->cmap->N,V->cmap->N));
1877       PetscCall(PetscSFGetGraph(a->sf,NULL,&n,NULL,NULL));
1878       ldu = n;
1879       ldv = n;
1880     }
1881     if (flg) {
1882       PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1883       PetscCall(MatDenseGetArrayRead(U,&u));
1884       PetscCall(MatDenseGetArrayRead(V,&v));
1885       if (usesf) {
1886         vv = MatH2OpusGetThrustPointer(*a->yy);
1887         PetscCall(PetscSFBcastBegin(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE));
1888         PetscCall(PetscSFBcastEnd(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE));
1889         if (U != V) {
1890           uu = MatH2OpusGetThrustPointer(*a->xx);
1891           PetscCall(PetscSFBcastBegin(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE));
1892           PetscCall(PetscSFBcastEnd(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE));
1893         } else uu = vv;
1894       } else { uu = u; vv = v; }
1895       hlru_global(*a->hmatrix,uu,ldu,vv,ldv,U->cmap->N,s,handle);
1896       PetscCall(MatDenseRestoreArrayRead(U,&u));
1897       PetscCall(MatDenseRestoreArrayRead(V,&v));
1898     } else {
1899 #if defined(PETSC_H2OPUS_USE_GPU)
1900       PetscBool flgU, flgV;
1901 
1902       PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1903       PetscCall(PetscObjectTypeCompareAny((PetscObject)U,&flgU,MATSEQDENSE,MATMPIDENSE,""));
1904       if (flgU) PetscCall(MatConvert(U,MATDENSECUDA,MAT_INPLACE_MATRIX,&U));
1905       PetscCall(PetscObjectTypeCompareAny((PetscObject)V,&flgV,MATSEQDENSE,MATMPIDENSE,""));
1906       if (flgV) PetscCall(MatConvert(V,MATDENSECUDA,MAT_INPLACE_MATRIX,&V));
1907       PetscCall(MatDenseCUDAGetArrayRead(U,&u));
1908       PetscCall(MatDenseCUDAGetArrayRead(V,&v));
1909       if (usesf) {
1910         vv = MatH2OpusGetThrustPointer(*a->yy_gpu);
1911         PetscCall(PetscSFBcastBegin(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE));
1912         PetscCall(PetscSFBcastEnd(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE));
1913         if (U != V) {
1914           uu = MatH2OpusGetThrustPointer(*a->xx_gpu);
1915           PetscCall(PetscSFBcastBegin(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE));
1916           PetscCall(PetscSFBcastEnd(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE));
1917         } else uu = vv;
1918       } else { uu = u; vv = v; }
1919 #else
1920       SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"This should not happen");
1921 #endif
1922       hlru_global(*a->hmatrix_gpu,uu,ldu,vv,ldv,U->cmap->N,s,handle);
1923 #if defined(PETSC_H2OPUS_USE_GPU)
1924       PetscCall(MatDenseCUDARestoreArrayRead(U,&u));
1925       PetscCall(MatDenseCUDARestoreArrayRead(V,&v));
1926       if (flgU) PetscCall(MatConvert(U,MATDENSE,MAT_INPLACE_MATRIX,&U));
1927       if (flgV) PetscCall(MatConvert(V,MATDENSE,MAT_INPLACE_MATRIX,&V));
1928 #endif
1929     }
1930     PetscCall(PetscLogEventEnd(MAT_H2Opus_LR,A,0,0,0));
1931     a->orthogonal = PETSC_FALSE;
1932   }
1933   PetscFunctionReturn(0);
1934 }
1935 #endif
1936