xref: /petsc/src/mat/impls/h2opus/cuda/math2opus.cu (revision 036e4bdc5ff4e9428be71571d75dc99dc1cdff13)
1 #include <h2opusconf.h>
2 /* skip compilation of this .cu file if H2OPUS is CPU only while PETSc has GPU support */
3 #if !defined(__CUDACC__) || defined(H2OPUS_USE_GPU)
4 #include <h2opus.h>
5 #if defined(H2OPUS_USE_MPI)
6 #include <h2opus/distributed/distributed_h2opus_handle.h>
7 #include <h2opus/distributed/distributed_geometric_construction.h>
8 #include <h2opus/distributed/distributed_hgemv.h>
9 #include <h2opus/distributed/distributed_horthog.h>
10 #include <h2opus/distributed/distributed_hcompress.h>
11 #endif
12 #include <h2opus/util/boxentrygen.h>
13 #include <petsc/private/matimpl.h>
14 #include <petsc/private/vecimpl.h>
15 #include <petsc/private/deviceimpl.h>
16 #include <petscsf.h>
17 
18 /* math2opusutils */
19 PETSC_INTERN PetscErrorCode PetscSFGetVectorSF(PetscSF,PetscInt,PetscInt,PetscInt,PetscSF*);
20 PETSC_INTERN PetscErrorCode MatDenseGetH2OpusVectorSF(Mat,PetscSF,PetscSF*);
21 PETSC_INTERN PetscErrorCode VecSign(Vec,Vec);
22 PETSC_INTERN PetscErrorCode VecSetDelta(Vec,PetscInt);
23 PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat,NormType,PetscInt,PetscReal*);
24 
25 #define MatH2OpusGetThrustPointer(v) thrust::raw_pointer_cast((v).data())
26 
27 /* Use GPU only if H2OPUS is configured for GPU */
28 #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU)
29 #define PETSC_H2OPUS_USE_GPU
30 #endif
31 #if defined(PETSC_H2OPUS_USE_GPU)
32 #define MatH2OpusUpdateIfNeeded(A,B) MatBindToCPU(A,(PetscBool)((A)->boundtocpu || (B)))
33 #else
34 #define MatH2OpusUpdateIfNeeded(A,B) 0
35 #endif
36 
37 // TODO H2OPUS:
38 // DistributedHMatrix
39 //   unsymmetric ?
40 //   transpose for distributed_hgemv?
41 //   clearData()
42 // Unify interface for sequential and parallel?
43 // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled)
44 //
45 template <class T> class PetscPointCloud : public H2OpusDataSet<T>
46 {
47   private:
48     int dimension;
49     size_t num_points;
50     std::vector<T> pts;
51 
52   public:
53     PetscPointCloud(int dim, size_t num_pts, const T coords[])
54     {
55       dim = dim > 0 ? dim : 1;
56       this->dimension = dim;
57       this->num_points = num_pts;
58 
59       pts.resize(num_pts*dim);
60       if (coords) {
61         for (size_t n = 0; n < num_pts; n++)
62           for (int i = 0; i < dim; i++)
63             pts[n*dim + i] = coords[n*dim + i];
64       } else {
65         PetscReal h = 1.0; //num_pts > 1 ? 1./(num_pts - 1) : 0.0;
66         for (size_t n = 0; n < num_pts; n++) {
67           pts[n*dim] = n*h;
68           for (int i = 1; i < dim; i++)
69             pts[n*dim + i] = 0.0;
70         }
71       }
72     }
73 
74     PetscPointCloud(const PetscPointCloud<T>& other)
75     {
76       size_t N = other.dimension * other.num_points;
77       this->dimension = other.dimension;
78       this->num_points = other.num_points;
79       this->pts.resize(N);
80       for (size_t i = 0; i < N; i++)
81         this->pts[i] = other.pts[i];
82     }
83 
84     int getDimension() const
85     {
86         return dimension;
87     }
88 
89     size_t getDataSetSize() const
90     {
91         return num_points;
92     }
93 
94     T getDataPoint(size_t idx, int dim) const
95     {
96         assert(dim < dimension && idx < num_points);
97         return pts[idx*dimension + dim];
98     }
99 
100     void Print(std::ostream& out = std::cout)
101     {
102        out << "Dimension: " << dimension << std::endl;
103        out << "NumPoints: " << num_points << std::endl;
104        for (size_t n = 0; n < num_points; n++) {
105          for (int d = 0; d < dimension; d++)
106            out << pts[n*dimension + d] << " ";
107          out << std::endl;
108        }
109     }
110 };
111 
112 template<class T> class PetscFunctionGenerator
113 {
114 private:
115   MatH2OpusKernel k;
116   int             dim;
117   void            *ctx;
118 
119 public:
120     PetscFunctionGenerator(MatH2OpusKernel k, int dim, void* ctx) { this->k = k; this->dim = dim; this->ctx = ctx; }
121     PetscFunctionGenerator(PetscFunctionGenerator& other) { this->k = other.k; this->dim = other.dim; this->ctx = other.ctx; }
122     T operator()(PetscReal *pt1, PetscReal *pt2)
123     {
124         return (T)((*this->k)(this->dim,pt1,pt2,this->ctx));
125     }
126 };
127 
128 #include <../src/mat/impls/h2opus/math2opussampler.hpp>
129 
130 /* just to not clutter the code */
131 #if !defined(H2OPUS_USE_GPU)
132 typedef HMatrix HMatrix_GPU;
133 #if defined(H2OPUS_USE_MPI)
134 typedef DistributedHMatrix DistributedHMatrix_GPU;
135 #endif
136 #endif
137 
/* Private data of a MATH2OPUS matrix: holds the (distributed) H2OPUS hierarchical
   matrices on CPU and GPU, the permutation objects mapping between the PETSc and
   the H2OPUS (clustered) orderings, and construction/compression parameters */
typedef struct {
#if defined(H2OPUS_USE_MPI)
  distributedH2OpusHandle_t handle;
#else
  h2opusHandle_t handle;
#endif
  /* Sequential and parallel matrices are two different classes at the moment */
  HMatrix *hmatrix;
#if defined(H2OPUS_USE_MPI)
  DistributedHMatrix *dist_hmatrix;
#else
  HMatrix *dist_hmatrix; /* just to not clutter the code */
#endif
  /* May use permutations */
  PetscSF sf;                           /* scatter between PETSc ordering and the H2OPUS index map */
  PetscLayout h2opus_rmap, h2opus_cmap; /* layouts in H2OPUS ordering; swapped with A->rmap/cmap by MatH2OpusSetNativeMult */
  IS h2opus_indexmap;                   /* clustering permutation (see MatSetUpMultiply_H2OPUS) */
  thrust::host_vector<PetscScalar> *xx,*yy; /* CPU work buffers for permuted input/output */
  PetscInt xxs,yys;                     /* number of columns currently allocated in xx/yy */
  PetscBool multsetup;                  /* true once MatSetUpMultiply_H2OPUS has run */

  /* GPU */
  HMatrix_GPU *hmatrix_gpu;
#if defined(H2OPUS_USE_MPI)
  DistributedHMatrix_GPU *dist_hmatrix_gpu;
#else
  HMatrix_GPU *dist_hmatrix_gpu; /* just to not clutter the code */
#endif
#if defined(PETSC_H2OPUS_USE_GPU)
  thrust::device_vector<PetscScalar> *xx_gpu,*yy_gpu; /* GPU work buffers for permuted input/output */
  PetscInt xxs_gpu,yys_gpu;             /* number of columns currently allocated in xx_gpu/yy_gpu */
#endif

  /* construction from matvecs */
  PetscMatrixSampler* sampler;
  PetscBool nativemult; /* if true, MatMult operates directly in the H2OPUS ordering (no SF scatters) */

  /* Admissibility */
  PetscReal eta;
  PetscInt  leafsize;

  /* for dof reordering */
  PetscPointCloud<PetscReal> *ptcloud;

  /* kernel for generating matrix entries */
  PetscFunctionGenerator<PetscScalar> *kernel;

  /* basis orthogonalized? */
  PetscBool orthogonal;

  /* customization */
  PetscInt  basisord;           /* basis order for off-diagonal sampling (kernel construction) */
  PetscInt  max_rank;           /* maximum rank for construction from matvecs */
  PetscInt  bs;                 /* number of concurrent samples */
  PetscReal rtol;               /* relative tolerance for construction from sampling */
  PetscInt  norm_max_samples;   /* samples used when approximating norms */
  PetscBool check_construction; /* check error after construction from sampling */
  PetscBool hara_verbose;
  PetscBool resize;             /* resize after compression */

  /* keeps track of MatScale values */
  PetscScalar s;
} Mat_H2OPUS;
201 
/* Destroy all H2OPUS data: native handles, CPU/GPU hierarchical matrices,
   permutation objects, work buffers, and the composed product methods */
static PetscErrorCode MatDestroy_H2OPUS(Mat A)
{
  Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;

  PetscFunctionBegin;
#if defined(H2OPUS_USE_MPI)
  h2opusDestroyDistributedHandle(a->handle);
#else
  h2opusDestroyHandle(a->handle);
#endif
  delete a->dist_hmatrix;
  delete a->hmatrix;
  PetscCall(PetscSFDestroy(&a->sf));
  PetscCall(PetscLayoutDestroy(&a->h2opus_rmap));
  PetscCall(PetscLayoutDestroy(&a->h2opus_cmap));
  PetscCall(ISDestroy(&a->h2opus_indexmap));
  delete a->xx;
  delete a->yy;
  delete a->hmatrix_gpu;
  delete a->dist_hmatrix_gpu;
#if defined(PETSC_H2OPUS_USE_GPU)
  delete a->xx_gpu;
  delete a->yy_gpu;
#endif
  delete a->sampler;
  delete a->ptcloud;
  delete a->kernel;
  /* remove the MatProduct implementations composed at creation time */
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdense_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdensecuda_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidense_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidensecuda_C",NULL));
  PetscCall(PetscObjectChangeTypeName((PetscObject)A,NULL));
  PetscCall(PetscFree(A->data));
  PetscFunctionReturn(0);
}
237 
238 PetscErrorCode MatH2OpusSetNativeMult(Mat A, PetscBool nm)
239 {
240   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
241   PetscBool      ish2opus;
242 
243   PetscFunctionBegin;
244   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
245   PetscValidLogicalCollectiveBool(A,nm,2);
246   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
247   if (ish2opus) {
248     if (a->h2opus_rmap) { /* need to swap layouts for vector creation */
249       if ((!a->nativemult && nm) || (a->nativemult && !nm)) {
250         PetscLayout t;
251         t = A->rmap;
252         A->rmap = a->h2opus_rmap;
253         a->h2opus_rmap = t;
254         t = A->cmap;
255         A->cmap = a->h2opus_cmap;
256         a->h2opus_cmap = t;
257       }
258     }
259     a->nativemult = nm;
260   }
261   PetscFunctionReturn(0);
262 }
263 
264 PetscErrorCode MatH2OpusGetNativeMult(Mat A, PetscBool *nm)
265 {
266   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
267   PetscBool      ish2opus;
268 
269   PetscFunctionBegin;
270   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
271   PetscValidPointer(nm,2);
272   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
273   PetscCheck(ish2opus,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name);
274   *nm = a->nativemult;
275   PetscFunctionReturn(0);
276 }
277 
278 PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat A, NormType normtype, PetscReal* n)
279 {
280   PetscBool      ish2opus;
281   PetscInt       nmax = PETSC_DECIDE;
282   Mat_H2OPUS     *a = NULL;
283   PetscBool      mult = PETSC_FALSE;
284 
285   PetscFunctionBegin;
286   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
287   if (ish2opus) { /* set userdefine number of samples and fastpath for mult (norms are order independent) */
288     a = (Mat_H2OPUS*)A->data;
289 
290     nmax = a->norm_max_samples;
291     mult = a->nativemult;
292     PetscCall(MatH2OpusSetNativeMult(A,PETSC_TRUE));
293   } else {
294     PetscCall(PetscOptionsGetInt(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_approximate_norm_samples",&nmax,NULL));
295   }
296   PetscCall(MatApproximateNorm_Private(A,normtype,nmax,n));
297   if (a) PetscCall(MatH2OpusSetNativeMult(A,mult));
298   PetscFunctionReturn(0);
299 }
300 
/* Ensure the permutation work buffers can hold xN input columns and yN output
   columns; resizes the CPU or GPU buffers depending on the current binding */
static PetscErrorCode MatH2OpusResizeBuffers_Private(Mat A, PetscInt xN, PetscInt yN)
{
  Mat_H2OPUS     *h2opus = (Mat_H2OPUS*)A->data;
  PetscInt       n;
  PetscBool      boundtocpu = PETSC_TRUE;

  PetscFunctionBegin;
#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
#endif
  /* n = number of SF leaves = local length of one column in H2OPUS ordering */
  PetscCall(PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL));
  if (boundtocpu) {
    /* grow-only: buffers are never shrunk */
    if (h2opus->xxs < xN) { h2opus->xx->resize(n*xN); h2opus->xxs = xN; }
    if (h2opus->yys < yN) { h2opus->yy->resize(n*yN); h2opus->yys = yN; }
  }
#if defined(PETSC_H2OPUS_USE_GPU)
  if (!boundtocpu) {
    if (h2opus->xxs_gpu < xN) { h2opus->xx_gpu->resize(n*xN); h2opus->xxs_gpu = xN; }
    if (h2opus->yys_gpu < yN) { h2opus->yy_gpu->resize(n*yN); h2opus->yys_gpu = yN; }
  }
#endif
  PetscFunctionReturn(0);
}
324 
/* Computes C = A*B (or C = A^T*B when transA) with B, C dense matrices.
   Runs on CPU or GPU according to A's binding; unless operating in native
   H2OPUS ordering, columns of B and C are permuted through star forests
   before/after the H2OPUS hgemv call. */
static PetscErrorCode MatMultNKernel_H2OPUS(Mat A, PetscBool transA, Mat B, Mat C)
{
  Mat_H2OPUS     *h2opus = (Mat_H2OPUS*)A->data;
#if defined(H2OPUS_USE_MPI)
  h2opusHandle_t handle = h2opus->handle->handle;
#else
  h2opusHandle_t handle = h2opus->handle;
#endif
  PetscBool      boundtocpu = PETSC_TRUE;
  PetscScalar    *xx,*yy,*uxx,*uyy;
  PetscInt       blda,clda;
  PetscMPIInt    size;
  PetscSF        bsf,csf;
  PetscBool      usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);

  PetscFunctionBegin;
  HLibProfile::clear();
#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
#endif
  PetscCall(MatDenseGetLDA(B,&blda));
  PetscCall(MatDenseGetLDA(C,&clda));
  if (usesf) {
    PetscInt n;

    /* SFs that scatter all the columns of the dense matrices at once */
    PetscCall(MatDenseGetH2OpusVectorSF(B,h2opus->sf,&bsf));
    PetscCall(MatDenseGetH2OpusVectorSF(C,h2opus->sf,&csf));

    PetscCall(MatH2OpusResizeBuffers_Private(A,B->cmap->N,C->cmap->N));
    PetscCall(PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL));
    /* permuted buffers are contiguous with leading dimension n */
    blda = n;
    clda = n;
  }
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
  if (boundtocpu) {
    PetscCall(MatDenseGetArrayRead(B,(const PetscScalar**)&xx));
    PetscCall(MatDenseGetArrayWrite(C,&yy));
    if (usesf) {
      uxx  = MatH2OpusGetThrustPointer(*h2opus->xx);
      uyy  = MatH2OpusGetThrustPointer(*h2opus->yy);
      /* permute input columns into H2OPUS ordering */
      PetscCall(PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
    } else {
      uxx = xx;
      uyy = yy;
    }
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
      /* parallel transpose mult only supported through symmetry */
      PetscCheck(!transA || A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
#if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
#endif
    } else {
      PetscCheck(h2opus->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
      hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
    }
    PetscCall(MatDenseRestoreArrayRead(B,(const PetscScalar**)&xx));
    if (usesf) {
      /* permute the result back to the PETSc ordering */
      PetscCall(PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
    }
    PetscCall(MatDenseRestoreArrayWrite(C,&yy));
#if defined(PETSC_H2OPUS_USE_GPU)
  } else {
    PetscBool ciscuda,biscuda;

    /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */
    PetscCall(PetscObjectTypeCompareAny((PetscObject)B,&biscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,""));
    if (!biscuda) {
      PetscCall(MatConvert(B,MATDENSECUDA,MAT_INPLACE_MATRIX,&B));
    }
    PetscCall(PetscObjectTypeCompareAny((PetscObject)C,&ciscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,""));
    if (!ciscuda) {
      C->assembled = PETSC_TRUE;
      PetscCall(MatConvert(C,MATDENSECUDA,MAT_INPLACE_MATRIX,&C));
    }
    PetscCall(MatDenseCUDAGetArrayRead(B,(const PetscScalar**)&xx));
    PetscCall(MatDenseCUDAGetArrayWrite(C,&yy));
    if (usesf) {
      uxx  = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
      uyy  = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
      PetscCall(PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
    } else {
      uxx = xx;
      uyy = yy;
    }
    PetscCall(PetscLogGpuTimeBegin());
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed GPU matrix");
      PetscCheck(!transA || A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
#if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
#endif
    } else {
      PetscCheck(h2opus->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
      hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatDenseCUDARestoreArrayRead(B,(const PetscScalar**)&xx));
    if (usesf) {
      PetscCall(PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
    }
    PetscCall(MatDenseCUDARestoreArrayWrite(C,&yy));
    /* convert B and C back to their original (host) type if they were converted above */
    if (!biscuda) {
      PetscCall(MatConvert(B,MATDENSE,MAT_INPLACE_MATRIX,&B));
    }
    if (!ciscuda) {
      PetscCall(MatConvert(C,MATDENSE,MAT_INPLACE_MATRIX,&C));
    }
#endif
  }
  { /* log flops */
    double gops,time,perf,dev;
    HLibProfile::getHgemvPerf(gops,time,perf,dev);
#if defined(PETSC_H2OPUS_USE_GPU)
    if (boundtocpu) {
      PetscCall(PetscLogFlops(1e9*gops));
    } else {
      PetscCall(PetscLogGpuFlops(1e9*gops));
    }
#else
    PetscCall(PetscLogFlops(1e9*gops));
#endif
  }
  PetscFunctionReturn(0);
}
453 
454 static PetscErrorCode MatProductNumeric_H2OPUS(Mat C)
455 {
456   Mat_Product    *product = C->product;
457 
458   PetscFunctionBegin;
459   MatCheckProduct(C,1);
460   switch (product->type) {
461   case MATPRODUCT_AB:
462     PetscCall(MatMultNKernel_H2OPUS(product->A,PETSC_FALSE,product->B,C));
463     break;
464   case MATPRODUCT_AtB:
465     PetscCall(MatMultNKernel_H2OPUS(product->A,PETSC_TRUE,product->B,C));
466     break;
467   default:
468     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]);
469   }
470   PetscFunctionReturn(0);
471 }
472 
473 static PetscErrorCode MatProductSymbolic_H2OPUS(Mat C)
474 {
475   Mat_Product    *product = C->product;
476   PetscBool      cisdense;
477   Mat            A,B;
478 
479   PetscFunctionBegin;
480   MatCheckProduct(C,1);
481   A = product->A;
482   B = product->B;
483   switch (product->type) {
484   case MATPRODUCT_AB:
485     PetscCall(MatSetSizes(C,A->rmap->n,B->cmap->n,A->rmap->N,B->cmap->N));
486     PetscCall(MatSetBlockSizesFromMats(C,product->A,product->B));
487     PetscCall(PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,""));
488     if (!cisdense) PetscCall(MatSetType(C,((PetscObject)product->B)->type_name));
489     PetscCall(MatSetUp(C));
490     break;
491   case MATPRODUCT_AtB:
492     PetscCall(MatSetSizes(C,A->cmap->n,B->cmap->n,A->cmap->N,B->cmap->N));
493     PetscCall(MatSetBlockSizesFromMats(C,product->A,product->B));
494     PetscCall(PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,""));
495     if (!cisdense) PetscCall(MatSetType(C,((PetscObject)product->B)->type_name));
496     PetscCall(MatSetUp(C));
497     break;
498   default:
499     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]);
500   }
501   C->ops->productsymbolic = NULL;
502   C->ops->productnumeric = MatProductNumeric_H2OPUS;
503   PetscFunctionReturn(0);
504 }
505 
506 static PetscErrorCode MatProductSetFromOptions_H2OPUS(Mat C)
507 {
508   PetscFunctionBegin;
509   MatCheckProduct(C,1);
510   if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) {
511     C->ops->productsymbolic = MatProductSymbolic_H2OPUS;
512   }
513   PetscFunctionReturn(0);
514 }
515 
/* Computes y = s*op(A)*x + sy*y (op = transpose when trans is true).
   Runs on CPU or GPU according to A's binding; unless operating in native
   H2OPUS ordering, x (and y when sy != 0) are permuted through the SF
   before the H2OPUS hgemv call, and the result permuted back afterwards. */
static PetscErrorCode MatMultKernel_H2OPUS(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans)
{
  Mat_H2OPUS     *h2opus = (Mat_H2OPUS*)A->data;
#if defined(H2OPUS_USE_MPI)
  h2opusHandle_t handle = h2opus->handle->handle;
#else
  h2opusHandle_t handle = h2opus->handle;
#endif
  PetscBool      boundtocpu = PETSC_TRUE;
  PetscInt       n;
  PetscScalar    *xx,*yy,*uxx,*uyy;
  PetscMPIInt    size;
  PetscBool      usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);

  PetscFunctionBegin;
  HLibProfile::clear();
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
#endif
  if (usesf) {
    /* n = local length of the permuted vectors */
    PetscCall(PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL));
  } else n = A->rmap->n;
  if (boundtocpu) {
    PetscCall(VecGetArrayRead(x,(const PetscScalar**)&xx));
    /* y's previous content only matters when sy != 0 */
    if (sy == 0.0) {
      PetscCall(VecGetArrayWrite(y,&yy));
    } else {
      PetscCall(VecGetArray(y,&yy));
    }
    if (usesf) {
      uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
      uyy = MatH2OpusGetThrustPointer(*h2opus->yy);

      /* permute x (and y when it contributes to the result) into H2OPUS ordering */
      PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
      if (sy != 0.0) {
        PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE));
        PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE));
      }
    } else {
      uxx = xx;
      uyy = yy;
    }
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
      /* parallel transpose mult only supported through symmetry */
      PetscCheck(!trans || A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
#if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix, uxx, n, sy, uyy, n, 1, h2opus->handle);
#endif
    } else {
      PetscCheck(h2opus->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
      hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, n, sy, uyy, n, 1, handle);
    }
    PetscCall(VecRestoreArrayRead(x,(const PetscScalar**)&xx));
    if (usesf) {
      /* permute the result back to the PETSc ordering */
      PetscCall(PetscSFReduceBegin(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
    }
    if (sy == 0.0) {
      PetscCall(VecRestoreArrayWrite(y,&yy));
    } else {
      PetscCall(VecRestoreArray(y,&yy));
    }
#if defined(PETSC_H2OPUS_USE_GPU)
  } else {
    PetscCall(VecCUDAGetArrayRead(x,(const PetscScalar**)&xx));
    if (sy == 0.0) {
      PetscCall(VecCUDAGetArrayWrite(y,&yy));
    } else {
      PetscCall(VecCUDAGetArray(y,&yy));
    }
    if (usesf) {
      uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
      uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);

      PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
      if (sy != 0.0) {
        PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE));
        PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE));
      }
    } else {
      uxx = xx;
      uyy = yy;
    }
    PetscCall(PetscLogGpuTimeBegin());
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed GPU matrix");
      PetscCheck(!trans || A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
#if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, h2opus->handle);
#endif
    } else {
      PetscCheck(h2opus->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
      hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle);
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(VecCUDARestoreArrayRead(x,(const PetscScalar**)&xx));
    if (usesf) {
      PetscCall(PetscSFReduceBegin(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
    }
    if (sy == 0.0) {
      PetscCall(VecCUDARestoreArrayWrite(y,&yy));
    } else {
      PetscCall(VecCUDARestoreArray(y,&yy));
    }
#endif
  }
  { /* log flops */
    double gops,time,perf,dev;
    HLibProfile::getHgemvPerf(gops,time,perf,dev);
#if defined(PETSC_H2OPUS_USE_GPU)
    if (boundtocpu) {
      PetscCall(PetscLogFlops(1e9*gops));
    } else {
      PetscCall(PetscLogGpuFlops(1e9*gops));
    }
#else
    PetscCall(PetscLogFlops(1e9*gops));
#endif
  }
  PetscFunctionReturn(0);
}
641 
642 static PetscErrorCode MatMultTranspose_H2OPUS(Mat A, Vec x, Vec y)
643 {
644   PetscBool      xiscuda,yiscuda;
645 
646   PetscFunctionBegin;
647   PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,""));
648   PetscCall(PetscObjectTypeCompareAny((PetscObject)y,&yiscuda,VECSEQCUDA,VECMPICUDA,""));
649   PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !yiscuda));
650   PetscCall(MatMultKernel_H2OPUS(A,x,0.0,y,PETSC_TRUE));
651   PetscFunctionReturn(0);
652 }
653 
654 static PetscErrorCode MatMult_H2OPUS(Mat A, Vec x, Vec y)
655 {
656   PetscBool      xiscuda,yiscuda;
657 
658   PetscFunctionBegin;
659   PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,""));
660   PetscCall(PetscObjectTypeCompareAny((PetscObject)y,&yiscuda,VECSEQCUDA,VECMPICUDA,""));
661   PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !yiscuda));
662   PetscCall(MatMultKernel_H2OPUS(A,x,0.0,y,PETSC_FALSE));
663   PetscFunctionReturn(0);
664 }
665 
666 static PetscErrorCode MatMultTransposeAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
667 {
668   PetscBool      xiscuda,ziscuda;
669 
670   PetscFunctionBegin;
671   PetscCall(VecCopy(y,z));
672   PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,""));
673   PetscCall(PetscObjectTypeCompareAny((PetscObject)z,&ziscuda,VECSEQCUDA,VECMPICUDA,""));
674   PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !ziscuda));
675   PetscCall(MatMultKernel_H2OPUS(A,x,1.0,z,PETSC_TRUE));
676   PetscFunctionReturn(0);
677 }
678 
679 static PetscErrorCode MatMultAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
680 {
681   PetscBool      xiscuda,ziscuda;
682 
683   PetscFunctionBegin;
684   PetscCall(VecCopy(y,z));
685   PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,""));
686   PetscCall(PetscObjectTypeCompareAny((PetscObject)z,&ziscuda,VECSEQCUDA,VECMPICUDA,""));
687   PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !ziscuda));
688   PetscCall(MatMultKernel_H2OPUS(A,x,1.0,z,PETSC_FALSE));
689   PetscFunctionReturn(0);
690 }
691 
692 static PetscErrorCode MatScale_H2OPUS(Mat A, PetscScalar s)
693 {
694   Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
695 
696   PetscFunctionBegin;
697   a->s *= s;
698   PetscFunctionReturn(0);
699 }
700 
701 static PetscErrorCode MatSetFromOptions_H2OPUS(PetscOptionItems *PetscOptionsObject,Mat A)
702 {
703   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
704 
705   PetscFunctionBegin;
706   PetscOptionsHeadBegin(PetscOptionsObject,"H2OPUS options");
707   PetscCall(PetscOptionsInt("-mat_h2opus_leafsize","Leaf size of cluster tree",NULL,a->leafsize,&a->leafsize,NULL));
708   PetscCall(PetscOptionsReal("-mat_h2opus_eta","Admissibility condition tolerance",NULL,a->eta,&a->eta,NULL));
709   PetscCall(PetscOptionsInt("-mat_h2opus_order","Basis order for off-diagonal sampling when constructed from kernel",NULL,a->basisord,&a->basisord,NULL));
710   PetscCall(PetscOptionsInt("-mat_h2opus_maxrank","Maximum rank when constructed from matvecs",NULL,a->max_rank,&a->max_rank,NULL));
711   PetscCall(PetscOptionsInt("-mat_h2opus_samples","Maximum number of samples to be taken concurrently when constructing from matvecs",NULL,a->bs,&a->bs,NULL));
712   PetscCall(PetscOptionsInt("-mat_h2opus_normsamples","Maximum bumber of samples to be when estimating norms",NULL,a->norm_max_samples,&a->norm_max_samples,NULL));
713   PetscCall(PetscOptionsReal("-mat_h2opus_rtol","Relative tolerance for construction from sampling",NULL,a->rtol,&a->rtol,NULL));
714   PetscCall(PetscOptionsBool("-mat_h2opus_check","Check error when constructing from sampling during MatAssemblyEnd()",NULL,a->check_construction,&a->check_construction,NULL));
715   PetscCall(PetscOptionsBool("-mat_h2opus_hara_verbose","Verbose output from hara construction",NULL,a->hara_verbose,&a->hara_verbose,NULL));
716   PetscCall(PetscOptionsBool("-mat_h2opus_resize","Resize after compression",NULL,a->resize,&a->resize,NULL));
717   PetscOptionsHeadEnd();
718   PetscFunctionReturn(0);
719 }
720 
721 static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat,PetscInt,const PetscReal[],PetscBool,MatH2OpusKernel,void*);
722 
723 static PetscErrorCode MatH2OpusInferCoordinates_Private(Mat A)
724 {
725   Mat_H2OPUS        *a = (Mat_H2OPUS*)A->data;
726   Vec               c;
727   PetscInt          spacedim;
728   const PetscScalar *coords;
729 
730   PetscFunctionBegin;
731   if (a->ptcloud) PetscFunctionReturn(0);
732   PetscCall(PetscObjectQuery((PetscObject)A,"__math2opus_coords",(PetscObject*)&c));
733   if (!c && a->sampler) {
734     Mat S = a->sampler->GetSamplingMat();
735 
736     PetscCall(PetscObjectQuery((PetscObject)S,"__math2opus_coords",(PetscObject*)&c));
737   }
738   if (!c) {
739     PetscCall(MatH2OpusSetCoords_H2OPUS(A,-1,NULL,PETSC_FALSE,NULL,NULL));
740   } else {
741     PetscCall(VecGetArrayRead(c,&coords));
742     PetscCall(VecGetBlockSize(c,&spacedim));
743     PetscCall(MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,PETSC_FALSE,NULL,NULL));
744     PetscCall(VecRestoreArrayRead(c,&coords));
745   }
746   PetscFunctionReturn(0);
747 }
748 
/* Set up everything needed to apply the matrix to PETSc vectors: extract the
   H2OPUS clustering index map, build the SF permutation between PETSc and
   H2OPUS orderings (unless the map is the identity), and allocate work buffers */
static PetscErrorCode MatSetUpMultiply_H2OPUS(Mat A)
{
  MPI_Comm       comm;
  PetscMPIInt    size;
  Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
  PetscInt       n = 0,*idx = NULL;
  int            *iidx = NULL;
  PetscCopyMode  own;
  PetscBool      rid;

  PetscFunctionBegin;
  if (a->multsetup) PetscFunctionReturn(0);
  if (a->sf) { /* MatDuplicate_H2OPUS takes reference to the SF */
    PetscCall(PetscSFGetGraph(a->sf,NULL,&n,NULL,NULL));
#if defined(PETSC_H2OPUS_USE_GPU)
    a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
    a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
    a->xxs_gpu = 1;
    a->yys_gpu = 1;
#endif
    a->xx  = new thrust::host_vector<PetscScalar>(n);
    a->yy  = new thrust::host_vector<PetscScalar>(n);
    a->xxs = 1;
    a->yys = 1;
  } else {
    IS is;
    PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
    PetscCallMPI(MPI_Comm_size(comm,&size));
    if (!a->h2opus_indexmap) {
      /* fetch the clustering index map from the (distributed) H2OPUS basis tree */
      if (size > 1) {
        PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
#if defined(H2OPUS_USE_MPI)
        iidx = MatH2OpusGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map);
        n    = a->dist_hmatrix->basis_tree.basis_branch.index_map.size();
#endif
      } else {
        iidx = MatH2OpusGetThrustPointer(a->hmatrix->u_basis_tree.index_map);
        n    = a->hmatrix->u_basis_tree.index_map.size();
      }

      /* H2OPUS stores int indices; widen to PetscInt when needed */
      if (PetscDefined(USE_64BIT_INDICES)) {
        PetscInt i;

        own  = PETSC_OWN_POINTER;
        PetscCall(PetscMalloc1(n,&idx));
        for (i=0;i<n;i++) idx[i] = iidx[i];
      } else {
        own  = PETSC_COPY_VALUES;
        idx  = (PetscInt*)iidx;
      }
      PetscCall(ISCreateGeneral(comm,n,idx,own,&is));
      PetscCall(ISSetPermutation(is));
      PetscCall(ISViewFromOptions(is,(PetscObject)A,"-mat_h2opus_indexmap_view"));
      a->h2opus_indexmap = is;
    }
    PetscCall(ISGetLocalSize(a->h2opus_indexmap,&n));
    PetscCall(ISGetIndices(a->h2opus_indexmap,(const PetscInt **)&idx));
    /* rid: is the map globally the identity? If so no SF/buffers are needed */
    rid  = (PetscBool)(n == A->rmap->n);
    PetscCall(MPIU_Allreduce(MPI_IN_PLACE,&rid,1,MPIU_BOOL,MPI_LAND,comm));
    if (rid) {
      PetscCall(ISIdentity(a->h2opus_indexmap,&rid));
    }
    if (!rid) {
      if (size > 1) { /* Parallel distribution may be different, save it here for fast path in MatMult (see MatH2OpusSetNativeMult) */
        PetscCall(PetscLayoutCreate(comm,&a->h2opus_rmap));
        PetscCall(PetscLayoutSetLocalSize(a->h2opus_rmap,n));
        PetscCall(PetscLayoutSetUp(a->h2opus_rmap));
        PetscCall(PetscLayoutReference(a->h2opus_rmap,&a->h2opus_cmap));
      }
      PetscCall(PetscSFCreate(comm,&a->sf));
      PetscCall(PetscSFSetGraphLayout(a->sf,A->rmap,n,NULL,PETSC_OWN_POINTER,idx));
      PetscCall(PetscSFSetUp(a->sf));
      PetscCall(PetscSFViewFromOptions(a->sf,(PetscObject)A,"-mat_h2opus_sf_view"));
#if defined(PETSC_H2OPUS_USE_GPU)
      a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
      a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
      a->xxs_gpu = 1;
      a->yys_gpu = 1;
#endif
      a->xx  = new thrust::host_vector<PetscScalar>(n);
      a->yy  = new thrust::host_vector<PetscScalar>(n);
      a->xxs = 1;
      a->yys = 1;
    }
    PetscCall(ISRestoreIndices(a->h2opus_indexmap,(const PetscInt **)&idx));
  }
  a->multsetup = PETSC_TRUE;
  PetscFunctionReturn(0);
}
838 
/* Assembly of a MATH2OPUS matrix: (re)builds the hierarchical matrix from the
   point cloud. Entries are generated either from the user-supplied kernel
   (sequential or distributed) or, sequentially only, by adaptive randomized
   sampling (hara) of a user-provided sampling matrix. Optionally checks the
   construction error against the sampled operator. */
static PetscErrorCode MatAssemblyEnd_H2OPUS(Mat A, MatAssemblyType assemblytype)
{
  Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
#if defined(H2OPUS_USE_MPI)
  h2opusHandle_t handle = a->handle->handle;
#else
  h2opusHandle_t handle = a->handle;
#endif
  PetscBool      kernel = PETSC_FALSE;
  PetscBool      boundtocpu = PETSC_TRUE;
  PetscBool      samplingdone = PETSC_FALSE;
  MPI_Comm       comm;
  PetscMPIInt    size;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
  PetscCheck(A->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported");
  PetscCheck(A->rmap->N == A->cmap->N,comm,PETSC_ERR_SUP,"Rectangular matrices are not supported");

  /* XXX */
  a->leafsize = PetscMin(a->leafsize, PetscMin(A->rmap->N, A->cmap->N));

  PetscCallMPI(MPI_Comm_size(comm,&size));
  /* TODO REUSABILITY of geometric construction */
  /* previous matrices (host and device) are discarded; construction starts from scratch */
  delete a->hmatrix;
  delete a->dist_hmatrix;
#if defined(PETSC_H2OPUS_USE_GPU)
  delete a->hmatrix_gpu;
  delete a->dist_hmatrix_gpu;
#endif
  /* any previously computed orthogonal basis is invalidated by reassembly */
  a->orthogonal = PETSC_FALSE;

  /* TODO: other? */
  H2OpusBoxCenterAdmissibility adm(a->eta);

  PetscCall(PetscLogEventBegin(MAT_H2Opus_Build,A,0,0,0));
  if (size > 1) {
#if defined(H2OPUS_USE_MPI)
    a->dist_hmatrix = new DistributedHMatrix(A->rmap->n/* ,A->symmetric */);
#else
    a->dist_hmatrix = NULL;
#endif
  } else {
    a->hmatrix = new HMatrix(A->rmap->n,A->symmetric);
  }
  PetscCall(MatH2OpusInferCoordinates_Private(A));
  PetscCheck(a->ptcloud,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing pointcloud");
  if (a->kernel) {
    /* kernel-based construction: entries generated on the fly from the kernel */
    BoxEntryGen<PetscScalar, H2OPUS_HWTYPE_CPU, PetscFunctionGenerator<PetscScalar>> entry_gen(*a->kernel);
    if (size > 1) {
      PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
#if defined(H2OPUS_USE_MPI)
      buildDistributedHMatrix(*a->dist_hmatrix,a->ptcloud,adm,entry_gen,a->leafsize,a->basisord,a->handle);
#endif
    } else {
      buildHMatrix(*a->hmatrix,a->ptcloud,adm,entry_gen,a->leafsize,a->basisord);
    }
    kernel = PETSC_TRUE;
  } else {
    /* sampling-based construction: only the structure is built here, entries come from hara() below */
    PetscCheck(size <= 1,comm,PETSC_ERR_SUP,"Construction from sampling not supported in parallel");
    buildHMatrixStructure(*a->hmatrix,a->ptcloud,a->leafsize,adm);
  }
  PetscCall(MatSetUpMultiply_H2OPUS(A));

#if defined(PETSC_H2OPUS_USE_GPU)
  /* mirror the freshly built CPU matrix on the GPU unless the Mat is bound to the CPU */
  boundtocpu = A->boundtocpu;
  if (!boundtocpu) {
    if (size > 1) {
      PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
#if defined(H2OPUS_USE_MPI)
      a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
#endif
    } else {
      a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
    }
  }
#endif
  if (size == 1) {
    if (!kernel && a->sampler && a->sampler->GetSamplingMat()) {
      /* adaptive randomized construction from matrix-vector products with the sampling matrix */
      PetscReal Anorm;
      bool      verbose;

      PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_hara_verbose",&a->hara_verbose,NULL));
      verbose = a->hara_verbose;
      PetscCall(MatApproximateNorm_Private(a->sampler->GetSamplingMat(),NORM_2,a->norm_max_samples,&Anorm));
      if (a->hara_verbose) PetscCall(PetscPrintf(PETSC_COMM_SELF,"Sampling uses max rank %d, tol %g (%g*%g), %s samples %d\n",a->max_rank,a->rtol*Anorm,a->rtol,Anorm,boundtocpu ? "CPU" : "GPU",a->bs));
      if (a->sf && !a->nativemult) {
        a->sampler->SetIndexMap(a->hmatrix->u_basis_tree.index_map.size(),a->hmatrix->u_basis_tree.index_map.data());
      }
      a->sampler->SetStream(handle->getMainStream());
      if (boundtocpu) {
        a->sampler->SetGPUSampling(false);
        hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose);
#if defined(PETSC_H2OPUS_USE_GPU)
      } else {
        a->sampler->SetGPUSampling(true);
        hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose);
#endif
      }
      samplingdone = PETSC_TRUE;
    }
  }
#if defined(PETSC_H2OPUS_USE_GPU)
  /* when running on the GPU, the CPU copies are released to save memory */
  if (!boundtocpu) {
    delete a->hmatrix;
    delete a->dist_hmatrix;
    a->hmatrix = NULL;
    a->dist_hmatrix = NULL;
  }
  A->offloadmask = boundtocpu ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
#endif
  PetscCall(PetscLogEventEnd(MAT_H2Opus_Build,A,0,0,0));

  if (!a->s) a->s = 1.0;
  A->assembled = PETSC_TRUE;

  /* optional construction check: compare the H2 matrix against the sampled operator */
  if (samplingdone) {
    PetscBool check = a->check_construction;
    PetscBool checke = PETSC_FALSE;

    PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_check",&check,NULL));
    PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_check_explicit",&checke,NULL));
    if (check) {
      Mat       E,Ae;
      PetscReal n1,ni,n2;
      PetscReal n1A,niA,n2A;
      void      (*normfunc)(void);

      Ae   = a->sampler->GetSamplingMat();
      PetscCall(MatConvert(A,MATSHELL,MAT_INITIAL_MATRIX,&E));
      PetscCall(MatShellSetOperation(E,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS));
      /* E = H2Mat - S; its norms measure the construction error */
      PetscCall(MatAXPY(E,-1.0,Ae,DIFFERENT_NONZERO_PATTERN));
      PetscCall(MatNorm(E,NORM_1,&n1));
      PetscCall(MatNorm(E,NORM_INFINITY,&ni));
      PetscCall(MatNorm(E,NORM_2,&n2));
      if (checke) {
        Mat eA,eE,eAe;

        PetscCall(MatComputeOperator(A,MATAIJ,&eA));
        PetscCall(MatComputeOperator(E,MATAIJ,&eE));
        PetscCall(MatComputeOperator(Ae,MATAIJ,&eAe));
        PetscCall(MatChop(eA,PETSC_SMALL));
        PetscCall(MatChop(eE,PETSC_SMALL));
        PetscCall(MatChop(eAe,PETSC_SMALL));
        PetscCall(PetscObjectSetName((PetscObject)eA,"H2Mat"));
        PetscCall(MatView(eA,NULL));
        PetscCall(PetscObjectSetName((PetscObject)eAe,"S"));
        PetscCall(MatView(eAe,NULL));
        PetscCall(PetscObjectSetName((PetscObject)eE,"H2Mat - S"));
        PetscCall(MatView(eE,NULL));
        PetscCall(MatDestroy(&eA));
        PetscCall(MatDestroy(&eE));
        PetscCall(MatDestroy(&eAe));
      }

      /* temporarily override Ae's norm so approximate (sampled) norms are used for the relative error */
      PetscCall(MatGetOperation(Ae,MATOP_NORM,&normfunc));
      PetscCall(MatSetOperation(Ae,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS));
      PetscCall(MatNorm(Ae,NORM_1,&n1A));
      PetscCall(MatNorm(Ae,NORM_INFINITY,&niA));
      PetscCall(MatNorm(Ae,NORM_2,&n2A));
      n1A  = PetscMax(n1A,PETSC_SMALL);
      n2A  = PetscMax(n2A,PETSC_SMALL);
      niA  = PetscMax(niA,PETSC_SMALL);
      PetscCall(MatSetOperation(Ae,MATOP_NORM,normfunc));
      PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A),"MATH2OPUS construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n",(double)n1,(double)ni,(double)n2,(double)(n1/n1A),(double)(ni/niA),(double)(n2/n2A)));
      PetscCall(MatDestroy(&E));
    }
    /* drop the reference to the sampling matrix once construction is done */
    a->sampler->SetSamplingMat(NULL);
  }
  PetscFunctionReturn(0);
}
1010 
1011 static PetscErrorCode MatZeroEntries_H2OPUS(Mat A)
1012 {
1013   PetscMPIInt    size;
1014   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1015 
1016   PetscFunctionBegin;
1017   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1018   PetscCheck(size <= 1,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not yet supported");
1019   else {
1020     a->hmatrix->clearData();
1021 #if defined(PETSC_H2OPUS_USE_GPU)
1022     if (a->hmatrix_gpu) a->hmatrix_gpu->clearData();
1023 #endif
1024   }
1025   PetscFunctionReturn(0);
1026 }
1027 
/* Duplicates a MATH2OPUS matrix: copies construction parameters and the point
   cloud, deep-copies the hierarchical matrices (host and device, sequential and
   distributed), and shares the communication SF and index map by reference.
   With MAT_DO_NOT_COPY_VALUES the duplicated matrices keep only their structure. */
static PetscErrorCode MatDuplicate_H2OPUS(Mat B, MatDuplicateOption op, Mat *nA)
{
  Mat            A;
  Mat_H2OPUS     *a, *b = (Mat_H2OPUS*)B->data;
#if defined(PETSC_H2OPUS_USE_GPU)
  PetscBool      iscpu = PETSC_FALSE;
#else
  PetscBool      iscpu = PETSC_TRUE;
#endif
  MPI_Comm       comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)B,&comm));
  PetscCall(MatCreate(comm,&A));
  PetscCall(MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N));
  PetscCall(MatSetType(A,MATH2OPUS));
  PetscCall(MatPropagateSymmetryOptions(B,A));
  a = (Mat_H2OPUS*)A->data;

  /* construction/compression parameters are always copied */
  a->eta              = b->eta;
  a->leafsize         = b->leafsize;
  a->basisord         = b->basisord;
  a->max_rank         = b->max_rank;
  a->bs               = b->bs;
  a->rtol             = b->rtol;
  a->norm_max_samples = b->norm_max_samples;
  if (op == MAT_COPY_VALUES) a->s = b->s;

  a->ptcloud = new PetscPointCloud<PetscReal>(*b->ptcloud);
  if (op == MAT_COPY_VALUES && b->kernel) a->kernel = new PetscFunctionGenerator<PetscScalar>(*b->kernel);

#if defined(H2OPUS_USE_MPI)
  if (b->dist_hmatrix) { a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix); }
#if defined(PETSC_H2OPUS_USE_GPU)
  if (b->dist_hmatrix_gpu) { a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu); }
#endif
#endif
  if (b->hmatrix) {
    a->hmatrix = new HMatrix(*b->hmatrix);
    if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData();
  }
#if defined(PETSC_H2OPUS_USE_GPU)
  if (b->hmatrix_gpu) {
    a->hmatrix_gpu = new HMatrix_GPU(*b->hmatrix_gpu);
    if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData();
  }
#endif
  /* the SF and the index map are immutable for a given structure: share by reference */
  if (b->sf) {
    PetscCall(PetscObjectReference((PetscObject)b->sf));
    a->sf = b->sf;
  }
  if (b->h2opus_indexmap) {
    PetscCall(PetscObjectReference((PetscObject)b->h2opus_indexmap));
    a->h2opus_indexmap = b->h2opus_indexmap;
  }

  PetscCall(MatSetUp(A));
  PetscCall(MatSetUpMultiply_H2OPUS(A));
  if (op == MAT_COPY_VALUES) {
    A->assembled = PETSC_TRUE;
    a->orthogonal = b->orthogonal;
#if defined(PETSC_H2OPUS_USE_GPU)
    A->offloadmask = B->offloadmask;
#endif
  }
#if defined(PETSC_H2OPUS_USE_GPU)
  iscpu = B->boundtocpu;
#endif
  /* match the CPU/GPU binding of the source matrix */
  PetscCall(MatBindToCPU(A,iscpu));

  *nA = A;
  PetscFunctionReturn(0);
}
1101 
1102 static PetscErrorCode MatView_H2OPUS(Mat A, PetscViewer view)
1103 {
1104   Mat_H2OPUS        *h2opus = (Mat_H2OPUS*)A->data;
1105   PetscBool         isascii, vieweps;
1106   PetscMPIInt       size;
1107   PetscViewerFormat format;
1108 
1109   PetscFunctionBegin;
1110   PetscCall(PetscObjectTypeCompare((PetscObject)view,PETSCVIEWERASCII,&isascii));
1111   PetscCall(PetscViewerGetFormat(view,&format));
1112   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1113   if (isascii) {
1114     if (format == PETSC_VIEWER_ASCII_MATLAB) {
1115       if (size == 1) {
1116         FILE *fp;
1117         PetscCall(PetscViewerASCIIGetPointer(view,&fp));
1118         dumpHMatrix(*h2opus->hmatrix,6,fp);
1119       }
1120     } else {
1121       PetscCall(PetscViewerASCIIPrintf(view,"  H-Matrix constructed from %s\n",h2opus->kernel ? "Kernel" : "Mat"));
1122       PetscCall(PetscViewerASCIIPrintf(view,"  PointCloud dim %" PetscInt_FMT "\n",h2opus->ptcloud ? h2opus->ptcloud->getDimension() : 0));
1123       PetscCall(PetscViewerASCIIPrintf(view,"  Admissibility parameters: leaf size %" PetscInt_FMT ", eta %g\n",h2opus->leafsize,(double)h2opus->eta));
1124       if (!h2opus->kernel) {
1125         PetscCall(PetscViewerASCIIPrintf(view,"  Sampling parameters: max_rank %" PetscInt_FMT ", samples %" PetscInt_FMT ", tolerance %g\n",h2opus->max_rank,h2opus->bs,(double)h2opus->rtol));
1126       } else {
1127         PetscCall(PetscViewerASCIIPrintf(view,"  Offdiagonal blocks approximation order %" PetscInt_FMT "\n",h2opus->basisord));
1128       }
1129       PetscCall(PetscViewerASCIIPrintf(view,"  Number of samples for norms %" PetscInt_FMT "\n",h2opus->norm_max_samples));
1130       if (size == 1) {
1131         double dense_mem_cpu = h2opus->hmatrix ? h2opus->hmatrix->getDenseMemoryUsage() : 0;
1132         double low_rank_cpu = h2opus->hmatrix ? h2opus->hmatrix->getLowRankMemoryUsage() : 0;
1133 #if defined(PETSC_HAVE_CUDA)
1134         double dense_mem_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getDenseMemoryUsage() : 0;
1135         double low_rank_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getLowRankMemoryUsage() : 0;
1136 #endif
1137         PetscCall(PetscViewerASCIIPrintf(view,"  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu));
1138 #if defined(PETSC_HAVE_CUDA)
1139         PetscCall(PetscViewerASCIIPrintf(view,"  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu));
1140 #endif
1141       } else {
1142 #if defined(PETSC_HAVE_CUDA)
1143         double      matrix_mem[4] = {0.,0.,0.,0.};
1144         PetscMPIInt rsize = 4;
1145 #else
1146         double      matrix_mem[2] = {0.,0.};
1147         PetscMPIInt rsize = 2;
1148 #endif
1149 #if defined(H2OPUS_USE_MPI)
1150         matrix_mem[0] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalDenseMemoryUsage() : 0;
1151         matrix_mem[1] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalLowRankMemoryUsage() : 0;
1152 #if defined(PETSC_HAVE_CUDA)
1153         matrix_mem[2] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalDenseMemoryUsage() : 0;
1154         matrix_mem[3] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalLowRankMemoryUsage() : 0;
1155 #endif
1156 #endif
1157         PetscCall(MPIU_Allreduce(MPI_IN_PLACE,matrix_mem,rsize,MPI_DOUBLE_PRECISION,MPI_SUM,PetscObjectComm((PetscObject)A)));
1158         PetscCall(PetscViewerASCIIPrintf(view,"  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[0], matrix_mem[1], matrix_mem[0] + matrix_mem[1]));
1159 #if defined(PETSC_HAVE_CUDA)
1160         PetscCall(PetscViewerASCIIPrintf(view,"  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[2], matrix_mem[3], matrix_mem[2] + matrix_mem[3]));
1161 #endif
1162       }
1163     }
1164   }
1165   vieweps = PETSC_FALSE;
1166   PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_vieweps",&vieweps,NULL));
1167   if (vieweps) {
1168     char filename[256];
1169     const char *name;
1170 
1171     PetscCall(PetscObjectGetName((PetscObject)A,&name));
1172     PetscCall(PetscSNPrintf(filename,sizeof(filename),"%s_structure.eps",name));
1173     PetscCall(PetscOptionsGetString(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_vieweps_filename",filename,sizeof(filename),NULL));
1174     outputEps(*h2opus->hmatrix,filename);
1175   }
1176   PetscFunctionReturn(0);
1177 }
1178 
1179 static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat A, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx)
1180 {
1181   Mat_H2OPUS     *h2opus = (Mat_H2OPUS*)A->data;
1182   PetscReal      *gcoords;
1183   PetscInt       N;
1184   MPI_Comm       comm;
1185   PetscMPIInt    size;
1186   PetscBool      cong;
1187 
1188   PetscFunctionBegin;
1189   PetscCall(PetscLayoutSetUp(A->rmap));
1190   PetscCall(PetscLayoutSetUp(A->cmap));
1191   PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
1192   PetscCall(MatHasCongruentLayouts(A,&cong));
1193   PetscCheck(cong,comm,PETSC_ERR_SUP,"Only for square matrices with congruent layouts");
1194   N = A->rmap->N;
1195   PetscCallMPI(MPI_Comm_size(comm,&size));
1196   if (spacedim > 0 && size > 1 && cdist) {
1197     PetscSF      sf;
1198     MPI_Datatype dtype;
1199 
1200     PetscCallMPI(MPI_Type_contiguous(spacedim,MPIU_REAL,&dtype));
1201     PetscCallMPI(MPI_Type_commit(&dtype));
1202 
1203     PetscCall(PetscSFCreate(comm,&sf));
1204     PetscCall(PetscSFSetGraphWithPattern(sf,A->rmap,PETSCSF_PATTERN_ALLGATHER));
1205     PetscCall(PetscMalloc1(spacedim*N,&gcoords));
1206     PetscCall(PetscSFBcastBegin(sf,dtype,coords,gcoords,MPI_REPLACE));
1207     PetscCall(PetscSFBcastEnd(sf,dtype,coords,gcoords,MPI_REPLACE));
1208     PetscCall(PetscSFDestroy(&sf));
1209     PetscCallMPI(MPI_Type_free(&dtype));
1210   } else gcoords = (PetscReal*)coords;
1211 
1212   delete h2opus->ptcloud;
1213   delete h2opus->kernel;
1214   h2opus->ptcloud = new PetscPointCloud<PetscReal>(spacedim,N,gcoords);
1215   if (kernel) h2opus->kernel = new PetscFunctionGenerator<PetscScalar>(kernel,spacedim,kernelctx);
1216   if (gcoords != coords) PetscCall(PetscFree(gcoords));
1217   A->preallocated = PETSC_TRUE;
1218   PetscFunctionReturn(0);
1219 }
1220 
1221 #if defined(PETSC_H2OPUS_USE_GPU)
/* Moves the hierarchical matrix between host and device. Binding to the CPU
   (flg == PETSC_TRUE) copies valid GPU data back to the host and frees the GPU
   copies; unbinding does the reverse. The default vector type is switched so
   vectors created from A match its execution space. No copy happens when the
   requested side already holds valid data (offloadmask test). */
static PetscErrorCode MatBindToCPU_H2OPUS(Mat A, PetscBool flg)
{
  PetscMPIInt    size;
  Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
  if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) {
    /* device -> host: reuse an existing CPU matrix if present, otherwise allocate one */
    if (size > 1) {
      PetscCheck(a->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
#if defined(H2OPUS_USE_MPI)
      if (!a->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix_gpu);
      else *a->dist_hmatrix = *a->dist_hmatrix_gpu;
#endif
    } else {
      PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
      if (!a->hmatrix) a->hmatrix = new HMatrix(*a->hmatrix_gpu);
      else *a->hmatrix = *a->hmatrix_gpu;
    }
    delete a->hmatrix_gpu;
    delete a->dist_hmatrix_gpu;
    a->hmatrix_gpu = NULL;
    a->dist_hmatrix_gpu = NULL;
    A->offloadmask = PETSC_OFFLOAD_CPU;
  } else if (!flg && A->offloadmask == PETSC_OFFLOAD_CPU) {
    /* host -> device: mirror the CPU matrix on the GPU and release the CPU copy */
    if (size > 1) {
      PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
#if defined(H2OPUS_USE_MPI)
      if (!a->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
      else *a->dist_hmatrix_gpu = *a->dist_hmatrix;
#endif
    } else {
      PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
      if (!a->hmatrix_gpu) a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
      else *a->hmatrix_gpu = *a->hmatrix;
    }
    delete a->hmatrix;
    delete a->dist_hmatrix;
    a->hmatrix = NULL;
    a->dist_hmatrix = NULL;
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }
  PetscCall(PetscFree(A->defaultvectype));
  if (!flg) {
    PetscCall(PetscStrallocpy(VECCUDA,&A->defaultvectype));
  } else {
    PetscCall(PetscStrallocpy(VECSTANDARD,&A->defaultvectype));
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(0);
}
1273 #endif
1274 
1275 /*MC
1276      MATH2OPUS = "h2opus" - A matrix type for hierarchical matrices using the H2Opus package.
1277 
1278    Options Database Keys:
.     -mat_type h2opus - sets the matrix type to "h2opus" during a call to MatSetFromOptions()
1280 
1281    Notes:
1282      H2Opus implements hierarchical matrices in the H^2 flavour.
1283      It supports CPU or NVIDIA GPUs.
1284      For CPU only builds, use ./configure --download-h2opus --download-thrust to install PETSc to use H2Opus.
1285      In order to run on NVIDIA GPUs, use ./configure --download-h2opus --download-magma --download-kblas.
1286      For details and additional references, see
1287        "H2Opus: A distributed-memory multi-GPU software package for non-local operators",
1288      available at https://arxiv.org/abs/2109.05451.
1289 
1290    Level: beginner
1291 
1292 .seealso: `MATHTOOL`, `MATDENSE`, `MatCreateH2OpusFromKernel()`, `MatCreateH2OpusFromMat()`
1293 M*/
/* Type constructor for MATH2OPUS: allocates the per-matrix data, sets default
   construction/compression parameters, creates the H2Opus handle, and installs
   the matrix operations and composed product functions. */
PETSC_EXTERN PetscErrorCode MatCreate_H2OPUS(Mat A)
{
  Mat_H2OPUS     *a;
  PetscMPIInt    size;

  PetscFunctionBegin;
#if defined(PETSC_H2OPUS_USE_GPU)
  /* make sure the CUDA device is initialized before any H2Opus GPU call */
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
#endif
  PetscCall(PetscNewLog(A,&a));
  A->data = (void*)a;

  /* default admissibility, clustering and sampling parameters (see MatView output) */
  a->eta              = 0.9;
  a->leafsize         = 32;
  a->basisord         = 4;
  a->max_rank         = 64;
  a->bs               = 32;
  a->rtol             = 1.e-4;
  a->s                = 1.0;
  a->norm_max_samples = 10;
  a->resize           = PETSC_TRUE; /* reallocate after compression */
#if defined(H2OPUS_USE_MPI)
  /* NOTE(review): return value of the handle creation is not checked -- confirm it cannot fail here */
  h2opusCreateDistributedHandleComm(&a->handle,PetscObjectComm((PetscObject)A));
#else
  h2opusCreateHandle(&a->handle);
#endif
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
  PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATH2OPUS));
  PetscCall(PetscMemzero(A->ops,sizeof(struct _MatOps)));

  A->ops->destroy          = MatDestroy_H2OPUS;
  A->ops->view             = MatView_H2OPUS;
  A->ops->assemblyend      = MatAssemblyEnd_H2OPUS;
  A->ops->mult             = MatMult_H2OPUS;
  A->ops->multtranspose    = MatMultTranspose_H2OPUS;
  A->ops->multadd          = MatMultAdd_H2OPUS;
  A->ops->multtransposeadd = MatMultTransposeAdd_H2OPUS;
  A->ops->scale            = MatScale_H2OPUS;
  A->ops->duplicate        = MatDuplicate_H2OPUS;
  A->ops->setfromoptions   = MatSetFromOptions_H2OPUS;
  A->ops->norm             = MatNorm_H2OPUS;
  A->ops->zeroentries      = MatZeroEntries_H2OPUS;
#if defined(PETSC_H2OPUS_USE_GPU)
  A->ops->bindtocpu        = MatBindToCPU_H2OPUS;
#endif

  /* matrix-matrix products with dense matrices (CPU and CUDA, sequential and parallel) */
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdense_C",MatProductSetFromOptions_H2OPUS));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdensecuda_C",MatProductSetFromOptions_H2OPUS));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidense_C",MatProductSetFromOptions_H2OPUS));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidensecuda_C",MatProductSetFromOptions_H2OPUS));
#if defined(PETSC_H2OPUS_USE_GPU)
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA,&A->defaultvectype));
#endif
  PetscFunctionReturn(0);
}
1350 
1351 /*@C
1352      MatH2OpusOrthogonalize - Orthogonalize the basis tree of a hierarchical matrix.
1353 
1354    Input Parameter:
1355 .     A - the matrix
1356 
1357    Level: intermediate
1358 
1359 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`
1360 @*/
/* Orthogonalizes the basis tree of a MATH2OPUS matrix (no-op for other types or
   when already orthogonal). Runs on host or device depending on the matrix
   binding; flops reported by H2Opus are logged accordingly.

   Note on structure: the `} else {` branches below sit INSIDE the
   PETSC_H2OPUS_USE_GPU guards; in CPU-only builds boundtocpu is always
   PETSC_TRUE, so the GPU branches are compiled out together with the guard. */
PetscErrorCode MatH2OpusOrthogonalize(Mat A)
{
  PetscBool      ish2opus;
  Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
  PetscMPIInt    size;
  PetscBool      boundtocpu = PETSC_TRUE;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidType(A,1);
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
  if (!ish2opus) PetscFunctionReturn(0);
  if (a->orthogonal) PetscFunctionReturn(0);
  /* reset H2Opus' internal profiler so the flop count below covers only this call */
  HLibProfile::clear();
  PetscCall(PetscLogEventBegin(MAT_H2Opus_Orthog,A,0,0,0));
#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
#endif
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
  if (size > 1) {
    if (boundtocpu) {
      PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
#if defined(H2OPUS_USE_MPI)
      distributed_horthog(*a->dist_hmatrix, a->handle);
#endif
#if defined(PETSC_H2OPUS_USE_GPU)
      A->offloadmask = PETSC_OFFLOAD_CPU;
    } else {
      PetscCheck(a->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
      PetscCall(PetscLogGpuTimeBegin());
#if defined(H2OPUS_USE_MPI)
      distributed_horthog(*a->dist_hmatrix_gpu, a->handle);
#endif
      PetscCall(PetscLogGpuTimeEnd());
#endif
    }
  } else {
#if defined(H2OPUS_USE_MPI)
    h2opusHandle_t handle = a->handle->handle;
#else
    h2opusHandle_t handle = a->handle;
#endif
    if (boundtocpu) {
      PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
      horthog(*a->hmatrix, handle);
#if defined(PETSC_H2OPUS_USE_GPU)
      A->offloadmask = PETSC_OFFLOAD_CPU;
    } else {
      PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
      PetscCall(PetscLogGpuTimeBegin());
      horthog(*a->hmatrix_gpu, handle);
      PetscCall(PetscLogGpuTimeEnd());
#endif
    }
  }
  a->orthogonal = PETSC_TRUE;
  { /* log flops */
    double gops,time,perf,dev;
    HLibProfile::getHorthogPerf(gops,time,perf,dev);
#if defined(PETSC_H2OPUS_USE_GPU)
    if (boundtocpu) {
      PetscCall(PetscLogFlops(1e9*gops));
    } else {
      PetscCall(PetscLogGpuFlops(1e9*gops));
    }
#else
    PetscCall(PetscLogFlops(1e9*gops));
#endif
  }
  PetscCall(PetscLogEventEnd(MAT_H2Opus_Orthog,A,0,0,0));
  PetscFunctionReturn(0);
}
1433 
1434 /*@C
1435      MatH2OpusCompress - Compress a hierarchical matrix.
1436 
1437    Input Parameters:
1438 +     A - the matrix
1439 -     tol - the absolute truncation threshold
1440 
1441    Level: intermediate
1442 
1443 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusOrthogonalize()`
1444 @*/
/* Compresses a MATH2OPUS matrix to the given absolute truncation threshold,
   orthogonalizing first if needed (no-op for other types or tol <= 0). When
   a->resize is set, the matrix is copy-reconstructed after compression so the
   over-allocated storage is released. Runs on host or device depending on the
   matrix binding.

   Note on structure: as in MatH2OpusOrthogonalize, the `} else {` branches sit
   inside the PETSC_H2OPUS_USE_GPU guards; CPU-only builds always take the
   boundtocpu path and compile the GPU branches out with the guard. */
PetscErrorCode MatH2OpusCompress(Mat A, PetscReal tol)
{
  PetscBool      ish2opus;
  Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
  PetscMPIInt    size;
  PetscBool      boundtocpu = PETSC_TRUE;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidType(A,1);
  PetscValidLogicalCollectiveReal(A,tol,2);
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
  if (!ish2opus || tol <= 0.0) PetscFunctionReturn(0);
  /* compression requires an orthogonal basis */
  PetscCall(MatH2OpusOrthogonalize(A));
  /* reset H2Opus' internal profiler so the flop count below covers only this call */
  HLibProfile::clear();
  PetscCall(PetscLogEventBegin(MAT_H2Opus_Compress,A,0,0,0));
#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
#endif
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
  if (size > 1) {
    if (boundtocpu) {
      PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
#if defined(H2OPUS_USE_MPI)
      distributed_hcompress(*a->dist_hmatrix, tol, a->handle);
      if (a->resize) {
        /* copy-construct to shrink storage to the compressed size */
        DistributedHMatrix *dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix);
        delete a->dist_hmatrix;
        a->dist_hmatrix = dist_hmatrix;
      }
#endif
#if defined(PETSC_H2OPUS_USE_GPU)
      A->offloadmask = PETSC_OFFLOAD_CPU;
    } else {
      PetscCheck(a->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
      PetscCall(PetscLogGpuTimeBegin());
#if defined(H2OPUS_USE_MPI)
      distributed_hcompress(*a->dist_hmatrix_gpu, tol, a->handle);

      if (a->resize) {
        DistributedHMatrix_GPU *dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix_gpu);
        delete a->dist_hmatrix_gpu;
        a->dist_hmatrix_gpu = dist_hmatrix_gpu;
      }
#endif
      PetscCall(PetscLogGpuTimeEnd());
#endif
    }
  } else {
#if defined(H2OPUS_USE_MPI)
    h2opusHandle_t handle = a->handle->handle;
#else
    h2opusHandle_t handle = a->handle;
#endif
    if (boundtocpu) {
      PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
      hcompress(*a->hmatrix, tol, handle);

      if (a->resize) {
        HMatrix *hmatrix = new HMatrix(*a->hmatrix);
        delete a->hmatrix;
        a->hmatrix = hmatrix;
      }
#if defined(PETSC_H2OPUS_USE_GPU)
      A->offloadmask = PETSC_OFFLOAD_CPU;
    } else {
      PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
      PetscCall(PetscLogGpuTimeBegin());
      hcompress(*a->hmatrix_gpu, tol, handle);
      PetscCall(PetscLogGpuTimeEnd());

      if (a->resize) {
        HMatrix_GPU *hmatrix_gpu = new HMatrix_GPU(*a->hmatrix_gpu);
        delete a->hmatrix_gpu;
        a->hmatrix_gpu = hmatrix_gpu;
      }
#endif
    }
  }
  { /* log flops */
    double gops,time,perf,dev;
    HLibProfile::getHcompressPerf(gops,time,perf,dev);
#if defined(PETSC_H2OPUS_USE_GPU)
    if (boundtocpu) {
      PetscCall(PetscLogFlops(1e9*gops));
    } else {
      PetscCall(PetscLogGpuFlops(1e9*gops));
    }
#else
    PetscCall(PetscLogFlops(1e9*gops));
#endif
  }
  PetscCall(PetscLogEventEnd(MAT_H2Opus_Compress,A,0,0,0));
  PetscFunctionReturn(0);
}
1540 
1541 /*@C
1542      MatH2OpusSetSamplingMat - Set a matrix to be sampled from matrix vector product to construct a hierarchical matrix.
1543 
1544    Input Parameters:
1545 +     A - the hierarchical matrix
1546 .     B - the matrix to be sampled
1547 .     bs - maximum number of samples to be taken concurrently
1548 -     tol - relative tolerance for construction
1549 
1550    Notes: Need to call MatAssemblyBegin/End() to update the hierarchical matrix.
1551 
1552    Level: intermediate
1553 
1554 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`
1555 @*/
1556 PetscErrorCode MatH2OpusSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol)
1557 {
1558   PetscBool      ish2opus;
1559 
1560   PetscFunctionBegin;
1561   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
1562   PetscValidType(A,1);
1563   if (B) PetscValidHeaderSpecific(B,MAT_CLASSID,2);
1564   PetscValidLogicalCollectiveInt(A,bs,3);
1565   PetscValidLogicalCollectiveReal(A,tol,3);
1566   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
1567   if (ish2opus) {
1568     Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
1569 
1570     if (!a->sampler) a->sampler = new PetscMatrixSampler();
1571     a->sampler->SetSamplingMat(B);
1572     if (bs > 0) a->bs = bs;
1573     if (tol > 0.) a->rtol = tol;
1574     delete a->kernel;
1575   }
1576   PetscFunctionReturn(0);
1577 }
1578 
1579 /*@C
1580      MatCreateH2OpusFromKernel - Creates a MATH2OPUS from a user-supplied kernel.
1581 
1582    Input Parameters:
1583 +     comm - MPI communicator
1584 .     m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1585 .     n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
1586 .     M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
1587 .     N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
1588 .     spacedim - dimension of the space coordinates
1589 .     coords - coordinates of the points
1590 .     cdist - whether or not coordinates are distributed
1591 .     kernel - computational kernel (or NULL)
1592 .     kernelctx - kernel context
1593 .     eta - admissibility condition tolerance
1594 .     leafsize - leaf size in cluster tree
1595 -     basisord - approximation order for Chebychev interpolation of low-rank blocks
1596 
1597    Output Parameter:
1598 .     nA - matrix
1599 
1600    Options Database Keys:
1601 +     -mat_h2opus_leafsize <PetscInt> - Leaf size of cluster tree
1602 .     -mat_h2opus_eta <PetscReal> - Admissibility condition tolerance
1603 .     -mat_h2opus_order <PetscInt> - Chebychev approximation order
1604 -     -mat_h2opus_normsamples <PetscInt> - Maximum number of samples to be used when estimating norms
1605 
1606    Level: intermediate
1607 
1608 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`
1609 @*/
1610 PetscErrorCode MatCreateH2OpusFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat* nA)
1611 {
1612   Mat            A;
1613   Mat_H2OPUS     *h2opus;
1614 #if defined(PETSC_H2OPUS_USE_GPU)
1615   PetscBool      iscpu = PETSC_FALSE;
1616 #else
1617   PetscBool      iscpu = PETSC_TRUE;
1618 #endif
1619 
1620   PetscFunctionBegin;
1621   PetscCheck(m == n,PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported");
1622   PetscCall(MatCreate(comm,&A));
1623   PetscCall(MatSetSizes(A,m,n,M,N));
1624   PetscCheck(M == N,comm,PETSC_ERR_SUP,"Rectangular matrices are not supported");
1625   PetscCall(MatSetType(A,MATH2OPUS));
1626   PetscCall(MatBindToCPU(A,iscpu));
1627   PetscCall(MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,cdist,kernel,kernelctx));
1628 
1629   h2opus = (Mat_H2OPUS*)A->data;
1630   if (eta > 0.) h2opus->eta = eta;
1631   if (leafsize > 0) h2opus->leafsize = leafsize;
1632   if (basisord > 0) h2opus->basisord = basisord;
1633 
1634   *nA = A;
1635   PetscFunctionReturn(0);
1636 }
1637 
1638 /*@C
1639      MatCreateH2OpusFromMat - Creates a MATH2OPUS sampling from a user-supplied operator.
1640 
1641    Input Parameters:
1642 +     B - the matrix to be sampled
1643 .     spacedim - dimension of the space coordinates
1644 .     coords - coordinates of the points
1645 .     cdist - whether or not coordinates are distributed
1646 .     eta - admissibility condition tolerance
1647 .     leafsize - leaf size in cluster tree
1648 .     maxrank - maximum rank allowed
1649 .     bs - maximum number of samples to be taken concurrently
1650 -     rtol - relative tolerance for construction
1651 
1652    Output Parameter:
1653 .     nA - matrix
1654 
1655    Options Database Keys:
1656 +     -mat_h2opus_leafsize <PetscInt> - Leaf size of cluster tree
1657 .     -mat_h2opus_eta <PetscReal> - Admissibility condition tolerance
1658 .     -mat_h2opus_maxrank <PetscInt> - Maximum rank when constructed from matvecs
1659 .     -mat_h2opus_samples <PetscInt> - Maximum number of samples to be taken concurrently when constructing from matvecs
1660 .     -mat_h2opus_rtol <PetscReal> - Relative tolerance for construction from sampling
1661 .     -mat_h2opus_check <PetscBool> - Check error when constructing from sampling during MatAssemblyEnd()
1662 .     -mat_h2opus_hara_verbose <PetscBool> - Verbose output from hara construction
-     -mat_h2opus_normsamples <PetscInt> - Maximum number of samples to be used when estimating norms
1664 
1665    Notes: not available in parallel
1666 
1667    Level: intermediate
1668 
1669 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromKernel()`
1670 @*/
1671 PetscErrorCode MatCreateH2OpusFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA)
1672 {
1673   Mat            A;
1674   Mat_H2OPUS     *h2opus;
1675   MPI_Comm       comm;
1676   PetscBool      boundtocpu = PETSC_TRUE;
1677 
1678   PetscFunctionBegin;
1679   PetscValidHeaderSpecific(B,MAT_CLASSID,1);
1680   PetscValidLogicalCollectiveInt(B,spacedim,2);
1681   PetscValidLogicalCollectiveBool(B,cdist,4);
1682   PetscValidLogicalCollectiveReal(B,eta,5);
1683   PetscValidLogicalCollectiveInt(B,leafsize,6);
1684   PetscValidLogicalCollectiveInt(B,maxrank,7);
1685   PetscValidLogicalCollectiveInt(B,bs,8);
1686   PetscValidLogicalCollectiveReal(B,rtol,9);
1687   PetscValidPointer(nA,10);
1688   PetscCall(PetscObjectGetComm((PetscObject)B,&comm));
1689   PetscCheck(B->rmap->n == B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported");
1690   PetscCheck(B->rmap->N == B->cmap->N,comm,PETSC_ERR_SUP,"Rectangular matrices are not supported");
1691   PetscCall(MatCreate(comm,&A));
1692   PetscCall(MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N));
1693 #if defined(PETSC_H2OPUS_USE_GPU)
1694   {
1695     PetscBool iscuda;
1696     VecType   vtype;
1697 
1698     PetscCall(MatGetVecType(B,&vtype));
1699     PetscCall(PetscStrcmp(vtype,VECCUDA,&iscuda));
1700     if (!iscuda) {
1701       PetscCall(PetscStrcmp(vtype,VECSEQCUDA,&iscuda));
1702       if (!iscuda) {
1703         PetscCall(PetscStrcmp(vtype,VECMPICUDA,&iscuda));
1704       }
1705     }
1706     if (iscuda && !B->boundtocpu) boundtocpu = PETSC_FALSE;
1707   }
1708 #endif
1709   PetscCall(MatSetType(A,MATH2OPUS));
1710   PetscCall(MatBindToCPU(A,boundtocpu));
1711   if (spacedim) {
1712     PetscCall(MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,cdist,NULL,NULL));
1713   }
1714   PetscCall(MatPropagateSymmetryOptions(B,A));
1715   /* PetscCheck(A->symmetric,comm,PETSC_ERR_SUP,"Unsymmetric sampling does not work"); */
1716 
1717   h2opus = (Mat_H2OPUS*)A->data;
1718   h2opus->sampler = new PetscMatrixSampler(B);
1719   if (eta > 0.) h2opus->eta = eta;
1720   if (leafsize > 0) h2opus->leafsize = leafsize;
1721   if (maxrank > 0) h2opus->max_rank = maxrank;
1722   if (bs > 0) h2opus->bs = bs;
1723   if (rtol > 0.) h2opus->rtol = rtol;
1724   *nA = A;
1725   A->preallocated = PETSC_TRUE;
1726   PetscFunctionReturn(0);
1727 }
1728 
1729 /*@C
1730      MatH2OpusGetIndexMap - Access reordering index set.
1731 
   Input Parameter:
1733 .     A - the matrix
1734 
1735    Output Parameter:
1736 .     indexmap - the index set for the reordering
1737 
1738    Level: intermediate
1739 
1740 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1741 @*/
1742 PetscErrorCode MatH2OpusGetIndexMap(Mat A, IS *indexmap)
1743 {
1744   PetscBool      ish2opus;
1745   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1746 
1747   PetscFunctionBegin;
1748   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
1749   PetscValidType(A,1);
1750   PetscValidPointer(indexmap,2);
1751   PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1752   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
1753   PetscCheck(ish2opus,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name);
1754   *indexmap = a->h2opus_indexmap;
1755   PetscFunctionReturn(0);
1756 }
1757 
1758 /*@C
1759      MatH2OpusMapVec - Maps a vector between PETSc and H2Opus ordering
1760 
1761    Input Parameters:
1762 +     A - the matrix
1763 .     nativetopetsc - if true, maps from H2Opus ordering to PETSc ordering. If false, applies the reverse map
1764 -     in - the vector to be mapped
1765 
1766    Output Parameter:
1767 .     out - the newly created mapped vector
1768 
1769    Level: intermediate
1770 
1771 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1772 @*/
1773 PetscErrorCode MatH2OpusMapVec(Mat A, PetscBool nativetopetsc, Vec in, Vec* out)
1774 {
1775   PetscBool      ish2opus;
1776   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1777   PetscScalar    *xin,*xout;
1778   PetscBool      nm;
1779 
1780   PetscFunctionBegin;
1781   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
1782   PetscValidType(A,1);
1783   PetscValidLogicalCollectiveBool(A,nativetopetsc,2);
1784   PetscValidHeaderSpecific(in,VEC_CLASSID,3);
1785   PetscValidPointer(out,4);
1786   PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1787   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
1788   PetscCheck(ish2opus,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name);
1789   nm   = a->nativemult;
1790   PetscCall(MatH2OpusSetNativeMult(A,(PetscBool)!nativetopetsc));
1791   PetscCall(MatCreateVecs(A,out,NULL));
1792   PetscCall(MatH2OpusSetNativeMult(A,nm));
1793   if (!a->sf) { /* same ordering */
1794     PetscCall(VecCopy(in,*out));
1795     PetscFunctionReturn(0);
1796   }
1797   PetscCall(VecGetArrayRead(in,(const PetscScalar**)&xin));
1798   PetscCall(VecGetArrayWrite(*out,&xout));
1799   if (nativetopetsc) {
1800     PetscCall(PetscSFReduceBegin(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE));
1801     PetscCall(PetscSFReduceEnd(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE));
1802   } else {
1803     PetscCall(PetscSFBcastBegin(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE));
1804     PetscCall(PetscSFBcastEnd(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE));
1805   }
1806   PetscCall(VecRestoreArrayRead(in,(const PetscScalar**)&xin));
1807   PetscCall(VecRestoreArrayWrite(*out,&xout));
1808   PetscFunctionReturn(0);
1809 }
1810 
1811 /*@C
1812      MatH2OpusLowRankUpdate - Perform a low-rank update of the form A = A + s * U * V^T
1813 
1814    Input Parameters:
1815 +     A - the hierarchical matrix
.     U - the dense low-rank update matrix
.     V - (optional) the dense low-rank update matrix (if NULL, then V = U is assumed)
-     s - the scaling factor
1819 
1820    Notes: The U and V matrices must be in dense format
1821 
1822    Level: intermediate
1823 
1824 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`, `MATDENSE`
1825 @*/
PetscErrorCode MatH2OpusLowRankUpdate(Mat A, Mat U, Mat V, PetscScalar s)
{
  PetscBool      flg;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidType(A,1);
  PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
  PetscValidHeaderSpecific(U,MAT_CLASSID,2);
  PetscCheckSameComm(A,1,U,2);
  if (V) {
    PetscValidHeaderSpecific(V,MAT_CLASSID,3);
    PetscCheckSameComm(A,1,V,3);
  }
  PetscValidLogicalCollectiveScalar(A,s,4);

  /* V = NULL selects the symmetric update A + s * U * U^T */
  if (!V) V = U;
  PetscCheck(U->cmap->N == V->cmap->N,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Non matching rank update %" PetscInt_FMT " != %" PetscInt_FMT,U->cmap->N,V->cmap->N);
  if (!U->cmap->N) PetscFunctionReturn(0); /* rank-0 update: nothing to do */
  PetscCall(PetscLayoutCompare(U->rmap,A->rmap,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"A and U must have the same row layout");
  PetscCall(PetscLayoutCompare(V->rmap,A->cmap,&flg));
  PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"A column layout must match V row column layout");
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&flg));
  if (flg) { /* matrices of other types are silently left untouched */
    Mat_H2OPUS        *a = (Mat_H2OPUS*)A->data;
    const PetscScalar *u,*v,*uu,*vv;
    PetscInt          ldu,ldv;
    PetscMPIInt       size;
#if defined(H2OPUS_USE_MPI)
    h2opusHandle_t    handle = a->handle->handle;
#else
    h2opusHandle_t    handle = a->handle;
#endif
    /* when A uses PETSc (not native H2Opus) ordering, input data must be permuted through the SF */
    PetscBool         usesf = (PetscBool)(a->sf && !a->nativemult);
    PetscSF           usf,vsf;

    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
    PetscCheck(size <= 1,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not yet implemented in parallel");
    PetscCall(PetscLogEventBegin(MAT_H2Opus_LR,A,0,0,0));
    PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)U,&flg,MATSEQDENSE,MATMPIDENSE,""));
    PetscCheck(flg,PetscObjectComm((PetscObject)U),PETSC_ERR_SUP,"Not for U of type %s",((PetscObject)U)->type_name);
    PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)V,&flg,MATSEQDENSE,MATMPIDENSE,""));
    PetscCheck(flg,PetscObjectComm((PetscObject)V),PETSC_ERR_SUP,"Not for V of type %s",((PetscObject)V)->type_name);
    PetscCall(MatDenseGetLDA(U,&ldu));
    PetscCall(MatDenseGetLDA(V,&ldv));
    PetscCall(MatBoundToCPU(A,&flg)); /* flg now selects the CPU (true) or GPU (false) path below */
    if (usesf) {
      PetscInt n;

      /* obtain SFs mapping the dense columns to H2Opus ordering; the permuted
         data is contiguous, so the SF leaf count becomes the leading dimension */
      PetscCall(MatDenseGetH2OpusVectorSF(U,a->sf,&usf));
      PetscCall(MatDenseGetH2OpusVectorSF(V,a->sf,&vsf));
      PetscCall(MatH2OpusResizeBuffers_Private(A,U->cmap->N,V->cmap->N));
      PetscCall(PetscSFGetGraph(a->sf,NULL,&n,NULL,NULL));
      ldu = n;
      ldv = n;
    }
    if (flg) { /* CPU path */
      PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
      PetscCall(MatDenseGetArrayRead(U,&u));
      PetscCall(MatDenseGetArrayRead(V,&v));
      if (usesf) {
        /* permute V (and U when distinct) into the internal work buffers */
        vv = MatH2OpusGetThrustPointer(*a->yy);
        PetscCall(PetscSFBcastBegin(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE));
        PetscCall(PetscSFBcastEnd(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE));
        if (U != V) {
          uu = MatH2OpusGetThrustPointer(*a->xx);
          PetscCall(PetscSFBcastBegin(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE));
          PetscCall(PetscSFBcastEnd(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE));
        } else uu = vv; /* symmetric update: reuse the permuted V data */
      } else { uu = u; vv = v; }
      /* apply the low-rank update to the CPU hierarchical matrix */
      hlru_global(*a->hmatrix,uu,ldu,vv,ldv,U->cmap->N,s,handle);
      PetscCall(MatDenseRestoreArrayRead(U,&u));
      PetscCall(MatDenseRestoreArrayRead(V,&v));
    } else {
#if defined(PETSC_H2OPUS_USE_GPU)
      PetscBool flgU, flgV;

      PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
      /* host dense inputs are converted in place to device format (and converted back below) */
      PetscCall(PetscObjectTypeCompareAny((PetscObject)U,&flgU,MATSEQDENSE,MATMPIDENSE,""));
      if (flgU) PetscCall(MatConvert(U,MATDENSECUDA,MAT_INPLACE_MATRIX,&U));
      PetscCall(PetscObjectTypeCompareAny((PetscObject)V,&flgV,MATSEQDENSE,MATMPIDENSE,""));
      if (flgV) PetscCall(MatConvert(V,MATDENSECUDA,MAT_INPLACE_MATRIX,&V));
      PetscCall(MatDenseCUDAGetArrayRead(U,&u));
      PetscCall(MatDenseCUDAGetArrayRead(V,&v));
      if (usesf) {
        /* same permutation as the CPU path, using the device work buffers */
        vv = MatH2OpusGetThrustPointer(*a->yy_gpu);
        PetscCall(PetscSFBcastBegin(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE));
        PetscCall(PetscSFBcastEnd(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE));
        if (U != V) {
          uu = MatH2OpusGetThrustPointer(*a->xx_gpu);
          PetscCall(PetscSFBcastBegin(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE));
          PetscCall(PetscSFBcastEnd(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE));
        } else uu = vv;
      } else { uu = u; vv = v; }
#else
      SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"This should not happen");
#endif
      /* apply the low-rank update to the GPU hierarchical matrix */
      hlru_global(*a->hmatrix_gpu,uu,ldu,vv,ldv,U->cmap->N,s,handle);
#if defined(PETSC_H2OPUS_USE_GPU)
      PetscCall(MatDenseCUDARestoreArrayRead(U,&u));
      PetscCall(MatDenseCUDARestoreArrayRead(V,&v));
      if (flgU) PetscCall(MatConvert(U,MATDENSE,MAT_INPLACE_MATRIX,&U));
      if (flgV) PetscCall(MatConvert(V,MATDENSE,MAT_INPLACE_MATRIX,&V));
#endif
    }
    PetscCall(PetscLogEventEnd(MAT_H2Opus_LR,A,0,0,0));
    a->orthogonal = PETSC_FALSE; /* the update invalidates basis orthogonality */
  }
  PetscFunctionReturn(0);
}
1937 #endif
1938