xref: /petsc/src/mat/impls/h2opus/cuda/math2opus.cu (revision fdf6c4e30aafdbc795e4f76379caa977fd5cdf5a)
1 #include <h2opusconf.h>
2 /* skip compilation of this .cu file if H2OPUS is CPU only while PETSc has GPU support */
3 #if !defined(__CUDACC__) || defined(H2OPUS_USE_GPU)
4 #include <h2opus.h>
5 #if defined(H2OPUS_USE_MPI)
6 #include <h2opus/distributed/distributed_h2opus_handle.h>
7 #include <h2opus/distributed/distributed_geometric_construction.h>
8 #include <h2opus/distributed/distributed_hgemv.h>
9 #include <h2opus/distributed/distributed_horthog.h>
10 #include <h2opus/distributed/distributed_hcompress.h>
11 #endif
12 #include <h2opus/util/boxentrygen.h>
13 #include <petsc/private/matimpl.h>
14 #include <petsc/private/vecimpl.h>
15 #include <petsc/private/deviceimpl.h>
16 #include <petscsf.h>
17 
18 /* math2opusutils */
19 PETSC_INTERN PetscErrorCode PetscSFGetVectorSF(PetscSF,PetscInt,PetscInt,PetscInt,PetscSF*);
20 PETSC_INTERN PetscErrorCode MatDenseGetH2OpusVectorSF(Mat,PetscSF,PetscSF*);
21 PETSC_INTERN PetscErrorCode VecSign(Vec,Vec);
22 PETSC_INTERN PetscErrorCode VecSetDelta(Vec,PetscInt);
23 PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat,NormType,PetscInt,PetscReal*);
24 
25 #define MatH2OpusGetThrustPointer(v) thrust::raw_pointer_cast((v).data())
26 
27 /* Use GPU only if H2OPUS is configured for GPU */
28 #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU)
29 #define PETSC_H2OPUS_USE_GPU
30 #endif
31 #if defined(PETSC_H2OPUS_USE_GPU)
32 #define MatH2OpusUpdateIfNeeded(A,B) MatBindToCPU(A,(PetscBool)((A)->boundtocpu || (B)))
33 #else
34 #define MatH2OpusUpdateIfNeeded(A,B) 0
35 #endif
36 
37 // TODO H2OPUS:
38 // DistributedHMatrix
39 //   unsymmetric ?
40 //   transpose for distributed_hgemv?
41 //   clearData()
42 // Unify interface for sequential and parallel?
43 // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled)
44 //
45 template <class T> class PetscPointCloud : public H2OpusDataSet<T>
46 {
47   private:
48     int dimension;
49     size_t num_points;
50     std::vector<T> pts;
51 
52   public:
53     PetscPointCloud(int dim, size_t num_pts, const T coords[])
54     {
55       dim = dim > 0 ? dim : 1;
56       this->dimension = dim;
57       this->num_points = num_pts;
58 
59       pts.resize(num_pts*dim);
60       if (coords) {
61         for (size_t n = 0; n < num_pts; n++)
62           for (int i = 0; i < dim; i++)
63             pts[n*dim + i] = coords[n*dim + i];
64       } else {
65         PetscReal h = 1.0; //num_pts > 1 ? 1./(num_pts - 1) : 0.0;
66         for (size_t n = 0; n < num_pts; n++) {
67           pts[n*dim] = n*h;
68           for (int i = 1; i < dim; i++)
69             pts[n*dim + i] = 0.0;
70         }
71       }
72     }
73 
74     PetscPointCloud(const PetscPointCloud<T>& other)
75     {
76       size_t N = other.dimension * other.num_points;
77       this->dimension = other.dimension;
78       this->num_points = other.num_points;
79       this->pts.resize(N);
80       for (size_t i = 0; i < N; i++)
81         this->pts[i] = other.pts[i];
82     }
83 
84     int getDimension() const
85     {
86         return dimension;
87     }
88 
89     size_t getDataSetSize() const
90     {
91         return num_points;
92     }
93 
94     T getDataPoint(size_t idx, int dim) const
95     {
96         assert(dim < dimension && idx < num_points);
97         return pts[idx*dimension + dim];
98     }
99 
100     void Print(std::ostream& out = std::cout)
101     {
102        out << "Dimension: " << dimension << std::endl;
103        out << "NumPoints: " << num_points << std::endl;
104        for (size_t n = 0; n < num_points; n++) {
105          for (int d = 0; d < dimension; d++)
106            out << pts[n*dimension + d] << " ";
107          out << std::endl;
108        }
109     }
110 };
111 
112 template<class T> class PetscFunctionGenerator
113 {
114 private:
115   MatH2OpusKernel k;
116   int             dim;
117   void            *ctx;
118 
119 public:
120     PetscFunctionGenerator(MatH2OpusKernel k, int dim, void* ctx) { this->k = k; this->dim = dim; this->ctx = ctx; }
121     PetscFunctionGenerator(PetscFunctionGenerator& other) { this->k = other.k; this->dim = other.dim; this->ctx = other.ctx; }
122     T operator()(PetscReal *pt1, PetscReal *pt2)
123     {
124         return (T)((*this->k)(this->dim,pt1,pt2,this->ctx));
125     }
126 };
127 
128 #include <../src/mat/impls/h2opus/math2opussampler.hpp>
129 
130 /* just to not clutter the code */
131 #if !defined(H2OPUS_USE_GPU)
132 typedef HMatrix HMatrix_GPU;
133 #if defined(H2OPUS_USE_MPI)
134 typedef DistributedHMatrix DistributedHMatrix_GPU;
135 #endif
136 #endif
137 
138 typedef struct {
139 #if defined(H2OPUS_USE_MPI)
140   distributedH2OpusHandle_t handle;
141 #else
142   h2opusHandle_t handle;
143 #endif
144   /* Sequential and parallel matrices are two different classes at the moment */
145   HMatrix *hmatrix;
146 #if defined(H2OPUS_USE_MPI)
147   DistributedHMatrix *dist_hmatrix;
148 #else
149   HMatrix *dist_hmatrix; /* just to not clutter the code */
150 #endif
151   /* May use permutations */
152   PetscSF sf;
153   PetscLayout h2opus_rmap, h2opus_cmap;
154   IS h2opus_indexmap;
155   thrust::host_vector<PetscScalar> *xx,*yy;
156   PetscInt xxs,yys;
157   PetscBool multsetup;
158 
159   /* GPU */
160   HMatrix_GPU *hmatrix_gpu;
161 #if defined(H2OPUS_USE_MPI)
162   DistributedHMatrix_GPU *dist_hmatrix_gpu;
163 #else
164   HMatrix_GPU *dist_hmatrix_gpu; /* just to not clutter the code */
165 #endif
166 #if defined(PETSC_H2OPUS_USE_GPU)
167   thrust::device_vector<PetscScalar> *xx_gpu,*yy_gpu;
168   PetscInt xxs_gpu,yys_gpu;
169 #endif
170 
171   /* construction from matvecs */
172   PetscMatrixSampler* sampler;
173   PetscBool nativemult;
174 
175   /* Admissibility */
176   PetscReal eta;
177   PetscInt  leafsize;
178 
179   /* for dof reordering */
180   PetscPointCloud<PetscReal> *ptcloud;
181 
182   /* kernel for generating matrix entries */
183   PetscFunctionGenerator<PetscScalar> *kernel;
184 
185   /* basis orthogonalized? */
186   PetscBool orthogonal;
187 
188   /* customization */
189   PetscInt  basisord;
190   PetscInt  max_rank;
191   PetscInt  bs;
192   PetscReal rtol;
193   PetscInt  norm_max_samples;
194   PetscBool check_construction;
195   PetscBool hara_verbose;
196   PetscBool resize;
197 
198   /* keeps track of MatScale values */
199   PetscScalar s;
200 } Mat_H2OPUS;
201 
202 static PetscErrorCode MatDestroy_H2OPUS(Mat A)
203 {
204   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
205 
206   PetscFunctionBegin;
207 #if defined(H2OPUS_USE_MPI)
208   h2opusDestroyDistributedHandle(a->handle);
209 #else
210   h2opusDestroyHandle(a->handle);
211 #endif
212   delete a->dist_hmatrix;
213   delete a->hmatrix;
214   PetscCall(PetscSFDestroy(&a->sf));
215   PetscCall(PetscLayoutDestroy(&a->h2opus_rmap));
216   PetscCall(PetscLayoutDestroy(&a->h2opus_cmap));
217   PetscCall(ISDestroy(&a->h2opus_indexmap));
218   delete a->xx;
219   delete a->yy;
220   delete a->hmatrix_gpu;
221   delete a->dist_hmatrix_gpu;
222 #if defined(PETSC_H2OPUS_USE_GPU)
223   delete a->xx_gpu;
224   delete a->yy_gpu;
225 #endif
226   delete a->sampler;
227   delete a->ptcloud;
228   delete a->kernel;
229   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdense_C",NULL));
230   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdensecuda_C",NULL));
231   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidense_C",NULL));
232   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidensecuda_C",NULL));
233   PetscCall(PetscObjectChangeTypeName((PetscObject)A,NULL));
234   PetscCall(PetscFree(A->data));
235   PetscFunctionReturn(0);
236 }
237 
238 PetscErrorCode MatH2OpusSetNativeMult(Mat A, PetscBool nm)
239 {
240   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
241   PetscBool      ish2opus;
242 
243   PetscFunctionBegin;
244   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
245   PetscValidLogicalCollectiveBool(A,nm,2);
246   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
247   if (ish2opus) {
248     if (a->h2opus_rmap) { /* need to swap layouts for vector creation */
249       if ((!a->nativemult && nm) || (a->nativemult && !nm)) {
250         PetscLayout t;
251         t = A->rmap;
252         A->rmap = a->h2opus_rmap;
253         a->h2opus_rmap = t;
254         t = A->cmap;
255         A->cmap = a->h2opus_cmap;
256         a->h2opus_cmap = t;
257       }
258     }
259     a->nativemult = nm;
260   }
261   PetscFunctionReturn(0);
262 }
263 
264 PetscErrorCode MatH2OpusGetNativeMult(Mat A, PetscBool *nm)
265 {
266   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
267   PetscBool      ish2opus;
268 
269   PetscFunctionBegin;
270   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
271   PetscValidPointer(nm,2);
272   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
273   PetscCheck(ish2opus,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name);
274   *nm = a->nativemult;
275   PetscFunctionReturn(0);
276 }
277 
278 PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat A, NormType normtype, PetscReal* n)
279 {
280   PetscBool      ish2opus;
281   PetscInt       nmax = PETSC_DECIDE;
282   Mat_H2OPUS     *a = NULL;
283   PetscBool      mult = PETSC_FALSE;
284 
285   PetscFunctionBegin;
286   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
287   if (ish2opus) { /* set userdefine number of samples and fastpath for mult (norms are order independent) */
288     a = (Mat_H2OPUS*)A->data;
289 
290     nmax = a->norm_max_samples;
291     mult = a->nativemult;
292     PetscCall(MatH2OpusSetNativeMult(A,PETSC_TRUE));
293   } else {
294     PetscCall(PetscOptionsGetInt(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_approximate_norm_samples",&nmax,NULL));
295   }
296   PetscCall(MatApproximateNorm_Private(A,normtype,nmax,n));
297   if (a) PetscCall(MatH2OpusSetNativeMult(A,mult));
298   PetscFunctionReturn(0);
299 }
300 
301 static PetscErrorCode MatH2OpusResizeBuffers_Private(Mat A, PetscInt xN, PetscInt yN)
302 {
303   Mat_H2OPUS     *h2opus = (Mat_H2OPUS*)A->data;
304   PetscInt       n;
305   PetscBool      boundtocpu = PETSC_TRUE;
306 
307   PetscFunctionBegin;
308 #if defined(PETSC_H2OPUS_USE_GPU)
309   boundtocpu = A->boundtocpu;
310 #endif
311   PetscCall(PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL));
312   if (boundtocpu) {
313     if (h2opus->xxs < xN) { h2opus->xx->resize(n*xN); h2opus->xxs = xN; }
314     if (h2opus->yys < yN) { h2opus->yy->resize(n*yN); h2opus->yys = yN; }
315   }
316 #if defined(PETSC_H2OPUS_USE_GPU)
317   if (!boundtocpu) {
318     if (h2opus->xxs_gpu < xN) { h2opus->xx_gpu->resize(n*xN); h2opus->xxs_gpu = xN; }
319     if (h2opus->yys_gpu < yN) { h2opus->yy_gpu->resize(n*yN); h2opus->yys_gpu = yN; }
320   }
321 #endif
322   PetscFunctionReturn(0);
323 }
324 
325 static PetscErrorCode MatMultNKernel_H2OPUS(Mat A, PetscBool transA, Mat B, Mat C)
326 {
327   Mat_H2OPUS     *h2opus = (Mat_H2OPUS*)A->data;
328 #if defined(H2OPUS_USE_MPI)
329   h2opusHandle_t handle = h2opus->handle->handle;
330 #else
331   h2opusHandle_t handle = h2opus->handle;
332 #endif
333   PetscBool      boundtocpu = PETSC_TRUE;
334   PetscScalar    *xx,*yy,*uxx,*uyy;
335   PetscInt       blda,clda;
336   PetscMPIInt    size;
337   PetscSF        bsf,csf;
338   PetscBool      usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);
339 
340   PetscFunctionBegin;
341   HLibProfile::clear();
342 #if defined(PETSC_H2OPUS_USE_GPU)
343   boundtocpu = A->boundtocpu;
344 #endif
345   PetscCall(MatDenseGetLDA(B,&blda));
346   PetscCall(MatDenseGetLDA(C,&clda));
347   if (usesf) {
348     PetscInt n;
349 
350     PetscCall(MatDenseGetH2OpusVectorSF(B,h2opus->sf,&bsf));
351     PetscCall(MatDenseGetH2OpusVectorSF(C,h2opus->sf,&csf));
352 
353     PetscCall(MatH2OpusResizeBuffers_Private(A,B->cmap->N,C->cmap->N));
354     PetscCall(PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL));
355     blda = n;
356     clda = n;
357   }
358   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
359   if (boundtocpu) {
360     PetscCall(MatDenseGetArrayRead(B,(const PetscScalar**)&xx));
361     PetscCall(MatDenseGetArrayWrite(C,&yy));
362     if (usesf) {
363       uxx  = MatH2OpusGetThrustPointer(*h2opus->xx);
364       uyy  = MatH2OpusGetThrustPointer(*h2opus->yy);
365       PetscCall(PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
366       PetscCall(PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
367     } else {
368       uxx = xx;
369       uyy = yy;
370     }
371     if (size > 1) {
372       PetscCheck(h2opus->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
373       PetscCheck(!transA || A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
374 #if defined(H2OPUS_USE_MPI)
375       distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
376 #endif
377     } else {
378       PetscCheck(h2opus->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
379       hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
380     }
381     PetscCall(MatDenseRestoreArrayRead(B,(const PetscScalar**)&xx));
382     if (usesf) {
383       PetscCall(PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
384       PetscCall(PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
385     }
386     PetscCall(MatDenseRestoreArrayWrite(C,&yy));
387 #if defined(PETSC_H2OPUS_USE_GPU)
388   } else {
389     PetscBool ciscuda,biscuda;
390 
391     /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */
392     PetscCall(PetscObjectTypeCompareAny((PetscObject)B,&biscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,""));
393     if (!biscuda) {
394       PetscCall(MatConvert(B,MATDENSECUDA,MAT_INPLACE_MATRIX,&B));
395     }
396     PetscCall(PetscObjectTypeCompareAny((PetscObject)C,&ciscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,""));
397     if (!ciscuda) {
398       C->assembled = PETSC_TRUE;
399       PetscCall(MatConvert(C,MATDENSECUDA,MAT_INPLACE_MATRIX,&C));
400     }
401     PetscCall(MatDenseCUDAGetArrayRead(B,(const PetscScalar**)&xx));
402     PetscCall(MatDenseCUDAGetArrayWrite(C,&yy));
403     if (usesf) {
404       uxx  = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
405       uyy  = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
406       PetscCall(PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
407       PetscCall(PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
408     } else {
409       uxx = xx;
410       uyy = yy;
411     }
412     PetscCall(PetscLogGpuTimeBegin());
413     if (size > 1) {
414       PetscCheck(h2opus->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed GPU matrix");
415       PetscCheck(!transA || A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
416 #if defined(H2OPUS_USE_MPI)
417       distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
418 #endif
419     } else {
420       PetscCheck(h2opus->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
421       hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
422     }
423     PetscCall(PetscLogGpuTimeEnd());
424     PetscCall(MatDenseCUDARestoreArrayRead(B,(const PetscScalar**)&xx));
425     if (usesf) {
426       PetscCall(PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
427       PetscCall(PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
428     }
429     PetscCall(MatDenseCUDARestoreArrayWrite(C,&yy));
430     if (!biscuda) {
431       PetscCall(MatConvert(B,MATDENSE,MAT_INPLACE_MATRIX,&B));
432     }
433     if (!ciscuda) {
434       PetscCall(MatConvert(C,MATDENSE,MAT_INPLACE_MATRIX,&C));
435     }
436 #endif
437   }
438   { /* log flops */
439     double gops,time,perf,dev;
440     HLibProfile::getHgemvPerf(gops,time,perf,dev);
441 #if defined(PETSC_H2OPUS_USE_GPU)
442     if (boundtocpu) {
443       PetscCall(PetscLogFlops(1e9*gops));
444     } else {
445       PetscCall(PetscLogGpuFlops(1e9*gops));
446     }
447 #else
448     PetscCall(PetscLogFlops(1e9*gops));
449 #endif
450   }
451   PetscFunctionReturn(0);
452 }
453 
454 static PetscErrorCode MatProductNumeric_H2OPUS(Mat C)
455 {
456   Mat_Product    *product = C->product;
457 
458   PetscFunctionBegin;
459   MatCheckProduct(C,1);
460   switch (product->type) {
461   case MATPRODUCT_AB:
462     PetscCall(MatMultNKernel_H2OPUS(product->A,PETSC_FALSE,product->B,C));
463     break;
464   case MATPRODUCT_AtB:
465     PetscCall(MatMultNKernel_H2OPUS(product->A,PETSC_TRUE,product->B,C));
466     break;
467   default:
468     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]);
469   }
470   PetscFunctionReturn(0);
471 }
472 
473 static PetscErrorCode MatProductSymbolic_H2OPUS(Mat C)
474 {
475   Mat_Product    *product = C->product;
476   PetscBool      cisdense;
477   Mat            A,B;
478 
479   PetscFunctionBegin;
480   MatCheckProduct(C,1);
481   A = product->A;
482   B = product->B;
483   switch (product->type) {
484   case MATPRODUCT_AB:
485     PetscCall(MatSetSizes(C,A->rmap->n,B->cmap->n,A->rmap->N,B->cmap->N));
486     PetscCall(MatSetBlockSizesFromMats(C,product->A,product->B));
487     PetscCall(PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,""));
488     if (!cisdense) PetscCall(MatSetType(C,((PetscObject)product->B)->type_name));
489     PetscCall(MatSetUp(C));
490     break;
491   case MATPRODUCT_AtB:
492     PetscCall(MatSetSizes(C,A->cmap->n,B->cmap->n,A->cmap->N,B->cmap->N));
493     PetscCall(MatSetBlockSizesFromMats(C,product->A,product->B));
494     PetscCall(PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,""));
495     if (!cisdense) PetscCall(MatSetType(C,((PetscObject)product->B)->type_name));
496     PetscCall(MatSetUp(C));
497     break;
498   default:
499     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]);
500   }
501   C->ops->productsymbolic = NULL;
502   C->ops->productnumeric = MatProductNumeric_H2OPUS;
503   PetscFunctionReturn(0);
504 }
505 
506 static PetscErrorCode MatProductSetFromOptions_H2OPUS(Mat C)
507 {
508   PetscFunctionBegin;
509   MatCheckProduct(C,1);
510   if (C->product->type == MATPRODUCT_AB || C->product->type == MATPRODUCT_AtB) {
511     C->ops->productsymbolic = MatProductSymbolic_H2OPUS;
512   }
513   PetscFunctionReturn(0);
514 }
515 
516 static PetscErrorCode MatMultKernel_H2OPUS(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans)
517 {
518   Mat_H2OPUS     *h2opus = (Mat_H2OPUS*)A->data;
519 #if defined(H2OPUS_USE_MPI)
520   h2opusHandle_t handle = h2opus->handle->handle;
521 #else
522   h2opusHandle_t handle = h2opus->handle;
523 #endif
524   PetscBool      boundtocpu = PETSC_TRUE;
525   PetscInt       n;
526   PetscScalar    *xx,*yy,*uxx,*uyy;
527   PetscMPIInt    size;
528   PetscBool      usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);
529 
530   PetscFunctionBegin;
531   HLibProfile::clear();
532   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
533 #if defined(PETSC_H2OPUS_USE_GPU)
534   boundtocpu = A->boundtocpu;
535 #endif
536   if (usesf) {
537     PetscCall(PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL));
538   } else n = A->rmap->n;
539   if (boundtocpu) {
540     PetscCall(VecGetArrayRead(x,(const PetscScalar**)&xx));
541     if (sy == 0.0) {
542       PetscCall(VecGetArrayWrite(y,&yy));
543     } else {
544       PetscCall(VecGetArray(y,&yy));
545     }
546     if (usesf) {
547       uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
548       uyy = MatH2OpusGetThrustPointer(*h2opus->yy);
549 
550       PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
551       PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
552       if (sy != 0.0) {
553         PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE));
554         PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE));
555       }
556     } else {
557       uxx = xx;
558       uyy = yy;
559     }
560     if (size > 1) {
561       PetscCheck(h2opus->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
562       PetscCheck(!trans || A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
563 #if defined(H2OPUS_USE_MPI)
564       distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix, uxx, n, sy, uyy, n, 1, h2opus->handle);
565 #endif
566     } else {
567       PetscCheck(h2opus->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
568       hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, n, sy, uyy, n, 1, handle);
569     }
570     PetscCall(VecRestoreArrayRead(x,(const PetscScalar**)&xx));
571     if (usesf) {
572       PetscCall(PetscSFReduceBegin(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
573       PetscCall(PetscSFReduceEnd(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
574     }
575     if (sy == 0.0) {
576       PetscCall(VecRestoreArrayWrite(y,&yy));
577     } else {
578       PetscCall(VecRestoreArray(y,&yy));
579     }
580 #if defined(PETSC_H2OPUS_USE_GPU)
581   } else {
582     PetscCall(VecCUDAGetArrayRead(x,(const PetscScalar**)&xx));
583     if (sy == 0.0) {
584       PetscCall(VecCUDAGetArrayWrite(y,&yy));
585     } else {
586       PetscCall(VecCUDAGetArray(y,&yy));
587     }
588     if (usesf) {
589       uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
590       uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
591 
592       PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
593       PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
594       if (sy != 0.0) {
595         PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE));
596         PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE));
597       }
598     } else {
599       uxx = xx;
600       uyy = yy;
601     }
602     PetscCall(PetscLogGpuTimeBegin());
603     if (size > 1) {
604       PetscCheck(h2opus->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed GPU matrix");
605       PetscCheck(!trans || A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
606 #if defined(H2OPUS_USE_MPI)
607       distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, h2opus->handle);
608 #endif
609     } else {
610       PetscCheck(h2opus->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
611       hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle);
612     }
613     PetscCall(PetscLogGpuTimeEnd());
614     PetscCall(VecCUDARestoreArrayRead(x,(const PetscScalar**)&xx));
615     if (usesf) {
616       PetscCall(PetscSFReduceBegin(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
617       PetscCall(PetscSFReduceEnd(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
618     }
619     if (sy == 0.0) {
620       PetscCall(VecCUDARestoreArrayWrite(y,&yy));
621     } else {
622       PetscCall(VecCUDARestoreArray(y,&yy));
623     }
624 #endif
625   }
626   { /* log flops */
627     double gops,time,perf,dev;
628     HLibProfile::getHgemvPerf(gops,time,perf,dev);
629 #if defined(PETSC_H2OPUS_USE_GPU)
630     if (boundtocpu) {
631       PetscCall(PetscLogFlops(1e9*gops));
632     } else {
633       PetscCall(PetscLogGpuFlops(1e9*gops));
634     }
635 #else
636     PetscCall(PetscLogFlops(1e9*gops));
637 #endif
638   }
639   PetscFunctionReturn(0);
640 }
641 
642 static PetscErrorCode MatMultTranspose_H2OPUS(Mat A, Vec x, Vec y)
643 {
644   PetscBool      xiscuda,yiscuda;
645 
646   PetscFunctionBegin;
647   PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,""));
648   PetscCall(PetscObjectTypeCompareAny((PetscObject)y,&yiscuda,VECSEQCUDA,VECMPICUDA,""));
649   PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !yiscuda));
650   PetscCall(MatMultKernel_H2OPUS(A,x,0.0,y,PETSC_TRUE));
651   PetscFunctionReturn(0);
652 }
653 
654 static PetscErrorCode MatMult_H2OPUS(Mat A, Vec x, Vec y)
655 {
656   PetscBool      xiscuda,yiscuda;
657 
658   PetscFunctionBegin;
659   PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,""));
660   PetscCall(PetscObjectTypeCompareAny((PetscObject)y,&yiscuda,VECSEQCUDA,VECMPICUDA,""));
661   PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !yiscuda));
662   PetscCall(MatMultKernel_H2OPUS(A,x,0.0,y,PETSC_FALSE));
663   PetscFunctionReturn(0);
664 }
665 
666 static PetscErrorCode MatMultTransposeAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
667 {
668   PetscBool      xiscuda,ziscuda;
669 
670   PetscFunctionBegin;
671   PetscCall(VecCopy(y,z));
672   PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,""));
673   PetscCall(PetscObjectTypeCompareAny((PetscObject)z,&ziscuda,VECSEQCUDA,VECMPICUDA,""));
674   PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !ziscuda));
675   PetscCall(MatMultKernel_H2OPUS(A,x,1.0,z,PETSC_TRUE));
676   PetscFunctionReturn(0);
677 }
678 
679 static PetscErrorCode MatMultAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
680 {
681   PetscBool      xiscuda,ziscuda;
682 
683   PetscFunctionBegin;
684   PetscCall(VecCopy(y,z));
685   PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&xiscuda,VECSEQCUDA,VECMPICUDA,""));
686   PetscCall(PetscObjectTypeCompareAny((PetscObject)z,&ziscuda,VECSEQCUDA,VECMPICUDA,""));
687   PetscCall(MatH2OpusUpdateIfNeeded(A,!xiscuda || !ziscuda));
688   PetscCall(MatMultKernel_H2OPUS(A,x,1.0,z,PETSC_FALSE));
689   PetscFunctionReturn(0);
690 }
691 
692 static PetscErrorCode MatScale_H2OPUS(Mat A, PetscScalar s)
693 {
694   Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
695 
696   PetscFunctionBegin;
697   a->s *= s;
698   PetscFunctionReturn(0);
699 }
700 
701 static PetscErrorCode MatSetFromOptions_H2OPUS(PetscOptionItems *PetscOptionsObject,Mat A)
702 {
703   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
704 
705   PetscFunctionBegin;
706   PetscOptionsHeadBegin(PetscOptionsObject,"H2OPUS options");
707   PetscCall(PetscOptionsInt("-mat_h2opus_leafsize","Leaf size of cluster tree",NULL,a->leafsize,&a->leafsize,NULL));
708   PetscCall(PetscOptionsReal("-mat_h2opus_eta","Admissibility condition tolerance",NULL,a->eta,&a->eta,NULL));
709   PetscCall(PetscOptionsInt("-mat_h2opus_order","Basis order for off-diagonal sampling when constructed from kernel",NULL,a->basisord,&a->basisord,NULL));
710   PetscCall(PetscOptionsInt("-mat_h2opus_maxrank","Maximum rank when constructed from matvecs",NULL,a->max_rank,&a->max_rank,NULL));
711   PetscCall(PetscOptionsInt("-mat_h2opus_samples","Maximum number of samples to be taken concurrently when constructing from matvecs",NULL,a->bs,&a->bs,NULL));
712   PetscCall(PetscOptionsInt("-mat_h2opus_normsamples","Maximum bumber of samples to be when estimating norms",NULL,a->norm_max_samples,&a->norm_max_samples,NULL));
713   PetscCall(PetscOptionsReal("-mat_h2opus_rtol","Relative tolerance for construction from sampling",NULL,a->rtol,&a->rtol,NULL));
714   PetscCall(PetscOptionsBool("-mat_h2opus_check","Check error when constructing from sampling during MatAssemblyEnd()",NULL,a->check_construction,&a->check_construction,NULL));
715   PetscCall(PetscOptionsBool("-mat_h2opus_hara_verbose","Verbose output from hara construction",NULL,a->hara_verbose,&a->hara_verbose,NULL));
716   PetscCall(PetscOptionsBool("-mat_h2opus_resize","Resize after compression",NULL,a->resize,&a->resize,NULL));
717   PetscOptionsHeadEnd();
718   PetscFunctionReturn(0);
719 }
720 
721 static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat,PetscInt,const PetscReal[],PetscBool,MatH2OpusKernel,void*);
722 
723 static PetscErrorCode MatH2OpusInferCoordinates_Private(Mat A)
724 {
725   Mat_H2OPUS        *a = (Mat_H2OPUS*)A->data;
726   Vec               c;
727   PetscInt          spacedim;
728   const PetscScalar *coords;
729 
730   PetscFunctionBegin;
731   if (a->ptcloud) PetscFunctionReturn(0);
732   PetscCall(PetscObjectQuery((PetscObject)A,"__math2opus_coords",(PetscObject*)&c));
733   if (!c && a->sampler) {
734     Mat S = a->sampler->GetSamplingMat();
735 
736     PetscCall(PetscObjectQuery((PetscObject)S,"__math2opus_coords",(PetscObject*)&c));
737   }
738   if (!c) {
739     PetscCall(MatH2OpusSetCoords_H2OPUS(A,-1,NULL,PETSC_FALSE,NULL,NULL));
740   } else {
741     PetscCall(VecGetArrayRead(c,&coords));
742     PetscCall(VecGetBlockSize(c,&spacedim));
743     PetscCall(MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,PETSC_FALSE,NULL,NULL));
744     PetscCall(VecRestoreArrayRead(c,&coords));
745   }
746   PetscFunctionReturn(0);
747 }
748 
749 static PetscErrorCode MatSetUpMultiply_H2OPUS(Mat A)
750 {
751   MPI_Comm       comm;
752   PetscMPIInt    size;
753   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
754   PetscInt       n = 0,*idx = NULL;
755   int            *iidx = NULL;
756   PetscCopyMode  own;
757   PetscBool      rid;
758 
759   PetscFunctionBegin;
760   if (a->multsetup) PetscFunctionReturn(0);
761   if (a->sf) { /* MatDuplicate_H2OPUS takes reference to the SF */
762     PetscCall(PetscSFGetGraph(a->sf,NULL,&n,NULL,NULL));
763 #if defined(PETSC_H2OPUS_USE_GPU)
764     a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
765     a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
766     a->xxs_gpu = 1;
767     a->yys_gpu = 1;
768 #endif
769     a->xx  = new thrust::host_vector<PetscScalar>(n);
770     a->yy  = new thrust::host_vector<PetscScalar>(n);
771     a->xxs = 1;
772     a->yys = 1;
773   } else {
774     IS is;
775     PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
776     PetscCallMPI(MPI_Comm_size(comm,&size));
777     if (!a->h2opus_indexmap) {
778       if (size > 1) {
779         PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
780 #if defined(H2OPUS_USE_MPI)
781         iidx = MatH2OpusGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map);
782         n    = a->dist_hmatrix->basis_tree.basis_branch.index_map.size();
783 #endif
784       } else {
785         iidx = MatH2OpusGetThrustPointer(a->hmatrix->u_basis_tree.index_map);
786         n    = a->hmatrix->u_basis_tree.index_map.size();
787       }
788 
789       if (PetscDefined(USE_64BIT_INDICES)) {
790         PetscInt i;
791 
792         own  = PETSC_OWN_POINTER;
793         PetscCall(PetscMalloc1(n,&idx));
794         for (i=0;i<n;i++) idx[i] = iidx[i];
795       } else {
796         own  = PETSC_COPY_VALUES;
797         idx  = (PetscInt*)iidx;
798       }
799       PetscCall(ISCreateGeneral(comm,n,idx,own,&is));
800       PetscCall(ISSetPermutation(is));
801       PetscCall(ISViewFromOptions(is,(PetscObject)A,"-mat_h2opus_indexmap_view"));
802       a->h2opus_indexmap = is;
803     }
804     PetscCall(ISGetLocalSize(a->h2opus_indexmap,&n));
805     PetscCall(ISGetIndices(a->h2opus_indexmap,(const PetscInt **)&idx));
806     rid  = (PetscBool)(n == A->rmap->n);
807     PetscCall(MPIU_Allreduce(MPI_IN_PLACE,&rid,1,MPIU_BOOL,MPI_LAND,comm));
808     if (rid) {
809       PetscCall(ISIdentity(a->h2opus_indexmap,&rid));
810     }
811     if (!rid) {
812       if (size > 1) { /* Parallel distribution may be different, save it here for fast path in MatMult (see MatH2OpusSetNativeMult) */
813         PetscCall(PetscLayoutCreate(comm,&a->h2opus_rmap));
814         PetscCall(PetscLayoutSetLocalSize(a->h2opus_rmap,n));
815         PetscCall(PetscLayoutSetUp(a->h2opus_rmap));
816         PetscCall(PetscLayoutReference(a->h2opus_rmap,&a->h2opus_cmap));
817       }
818       PetscCall(PetscSFCreate(comm,&a->sf));
819       PetscCall(PetscSFSetGraphLayout(a->sf,A->rmap,n,NULL,PETSC_OWN_POINTER,idx));
820       PetscCall(PetscSFSetUp(a->sf));
821       PetscCall(PetscSFViewFromOptions(a->sf,(PetscObject)A,"-mat_h2opus_sf_view"));
822 #if defined(PETSC_H2OPUS_USE_GPU)
823       a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
824       a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
825       a->xxs_gpu = 1;
826       a->yys_gpu = 1;
827 #endif
828       a->xx  = new thrust::host_vector<PetscScalar>(n);
829       a->yy  = new thrust::host_vector<PetscScalar>(n);
830       a->xxs = 1;
831       a->yys = 1;
832     }
833     PetscCall(ISRestoreIndices(a->h2opus_indexmap,(const PetscInt **)&idx));
834   }
835   a->multsetup = PETSC_TRUE;
836   PetscFunctionReturn(0);
837 }
838 
839 static PetscErrorCode MatAssemblyEnd_H2OPUS(Mat A, MatAssemblyType assemblytype)
840 {
841   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
842 #if defined(H2OPUS_USE_MPI)
843   h2opusHandle_t handle = a->handle->handle;
844 #else
845   h2opusHandle_t handle = a->handle;
846 #endif
847   PetscBool      kernel = PETSC_FALSE;
848   PetscBool      boundtocpu = PETSC_TRUE;
849   PetscBool      samplingdone = PETSC_FALSE;
850   MPI_Comm       comm;
851   PetscMPIInt    size;
852 
853   PetscFunctionBegin;
854   PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
855   PetscCheck(A->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported");
856   PetscCheck(A->rmap->N == A->cmap->N,comm,PETSC_ERR_SUP,"Rectangular matrices are not supported");
857 
858   /* XXX */
859   a->leafsize = PetscMin(a->leafsize, PetscMin(A->rmap->N, A->cmap->N));
860 
861   PetscCallMPI(MPI_Comm_size(comm,&size));
862   /* TODO REUSABILITY of geometric construction */
863   delete a->hmatrix;
864   delete a->dist_hmatrix;
865 #if defined(PETSC_H2OPUS_USE_GPU)
866   delete a->hmatrix_gpu;
867   delete a->dist_hmatrix_gpu;
868 #endif
869   a->orthogonal = PETSC_FALSE;
870 
871   /* TODO: other? */
872   H2OpusBoxCenterAdmissibility adm(a->eta);
873 
874   PetscCall(PetscLogEventBegin(MAT_H2Opus_Build,A,0,0,0));
875   if (size > 1) {
876 #if defined(H2OPUS_USE_MPI)
877     a->dist_hmatrix = new DistributedHMatrix(A->rmap->n/* ,A->symmetric */);
878 #else
879     a->dist_hmatrix = NULL;
880 #endif
881   } else a->hmatrix = new HMatrix(A->rmap->n,A->symmetric == PETSC_BOOL3_TRUE);
882   PetscCall(MatH2OpusInferCoordinates_Private(A));
883   PetscCheck(a->ptcloud,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing pointcloud");
884   if (a->kernel) {
885     BoxEntryGen<PetscScalar, H2OPUS_HWTYPE_CPU, PetscFunctionGenerator<PetscScalar>> entry_gen(*a->kernel);
886     if (size > 1) {
887       PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
888 #if defined(H2OPUS_USE_MPI)
889       buildDistributedHMatrix(*a->dist_hmatrix,a->ptcloud,adm,entry_gen,a->leafsize,a->basisord,a->handle);
890 #endif
891     } else {
892       buildHMatrix(*a->hmatrix,a->ptcloud,adm,entry_gen,a->leafsize,a->basisord);
893     }
894     kernel = PETSC_TRUE;
895   } else {
896     PetscCheck(size <= 1,comm,PETSC_ERR_SUP,"Construction from sampling not supported in parallel");
897     buildHMatrixStructure(*a->hmatrix,a->ptcloud,a->leafsize,adm);
898   }
899   PetscCall(MatSetUpMultiply_H2OPUS(A));
900 
901 #if defined(PETSC_H2OPUS_USE_GPU)
902   boundtocpu = A->boundtocpu;
903   if (!boundtocpu) {
904     if (size > 1) {
905       PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
906 #if defined(H2OPUS_USE_MPI)
907       a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
908 #endif
909     } else {
910       a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
911     }
912   }
913 #endif
914   if (size == 1) {
915     if (!kernel && a->sampler && a->sampler->GetSamplingMat()) {
916       PetscReal Anorm;
917       bool      verbose;
918 
919       PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_hara_verbose",&a->hara_verbose,NULL));
920       verbose = a->hara_verbose;
921       PetscCall(MatApproximateNorm_Private(a->sampler->GetSamplingMat(),NORM_2,a->norm_max_samples,&Anorm));
922       if (a->hara_verbose) PetscCall(PetscPrintf(PETSC_COMM_SELF,"Sampling uses max rank %d, tol %g (%g*%g), %s samples %d\n",a->max_rank,a->rtol*Anorm,a->rtol,Anorm,boundtocpu ? "CPU" : "GPU",a->bs));
923       if (a->sf && !a->nativemult) {
924         a->sampler->SetIndexMap(a->hmatrix->u_basis_tree.index_map.size(),a->hmatrix->u_basis_tree.index_map.data());
925       }
926       a->sampler->SetStream(handle->getMainStream());
927       if (boundtocpu) {
928         a->sampler->SetGPUSampling(false);
929         hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose);
930 #if defined(PETSC_H2OPUS_USE_GPU)
931       } else {
932         a->sampler->SetGPUSampling(true);
933         hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose);
934 #endif
935       }
936       samplingdone = PETSC_TRUE;
937     }
938   }
939 #if defined(PETSC_H2OPUS_USE_GPU)
940   if (!boundtocpu) {
941     delete a->hmatrix;
942     delete a->dist_hmatrix;
943     a->hmatrix = NULL;
944     a->dist_hmatrix = NULL;
945   }
946   A->offloadmask = boundtocpu ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
947 #endif
948   PetscCall(PetscLogEventEnd(MAT_H2Opus_Build,A,0,0,0));
949 
950   if (!a->s) a->s = 1.0;
951   A->assembled = PETSC_TRUE;
952 
953   if (samplingdone) {
954     PetscBool check = a->check_construction;
955     PetscBool checke = PETSC_FALSE;
956 
957     PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_check",&check,NULL));
958     PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_check_explicit",&checke,NULL));
959     if (check) {
960       Mat       E,Ae;
961       PetscReal n1,ni,n2;
962       PetscReal n1A,niA,n2A;
963       void      (*normfunc)(void);
964 
965       Ae   = a->sampler->GetSamplingMat();
966       PetscCall(MatConvert(A,MATSHELL,MAT_INITIAL_MATRIX,&E));
967       PetscCall(MatShellSetOperation(E,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS));
968       PetscCall(MatAXPY(E,-1.0,Ae,DIFFERENT_NONZERO_PATTERN));
969       PetscCall(MatNorm(E,NORM_1,&n1));
970       PetscCall(MatNorm(E,NORM_INFINITY,&ni));
971       PetscCall(MatNorm(E,NORM_2,&n2));
972       if (checke) {
973         Mat eA,eE,eAe;
974 
975         PetscCall(MatComputeOperator(A,MATAIJ,&eA));
976         PetscCall(MatComputeOperator(E,MATAIJ,&eE));
977         PetscCall(MatComputeOperator(Ae,MATAIJ,&eAe));
978         PetscCall(MatChop(eA,PETSC_SMALL));
979         PetscCall(MatChop(eE,PETSC_SMALL));
980         PetscCall(MatChop(eAe,PETSC_SMALL));
981         PetscCall(PetscObjectSetName((PetscObject)eA,"H2Mat"));
982         PetscCall(MatView(eA,NULL));
983         PetscCall(PetscObjectSetName((PetscObject)eAe,"S"));
984         PetscCall(MatView(eAe,NULL));
985         PetscCall(PetscObjectSetName((PetscObject)eE,"H2Mat - S"));
986         PetscCall(MatView(eE,NULL));
987         PetscCall(MatDestroy(&eA));
988         PetscCall(MatDestroy(&eE));
989         PetscCall(MatDestroy(&eAe));
990       }
991 
992       PetscCall(MatGetOperation(Ae,MATOP_NORM,&normfunc));
993       PetscCall(MatSetOperation(Ae,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS));
994       PetscCall(MatNorm(Ae,NORM_1,&n1A));
995       PetscCall(MatNorm(Ae,NORM_INFINITY,&niA));
996       PetscCall(MatNorm(Ae,NORM_2,&n2A));
997       n1A  = PetscMax(n1A,PETSC_SMALL);
998       n2A  = PetscMax(n2A,PETSC_SMALL);
999       niA  = PetscMax(niA,PETSC_SMALL);
1000       PetscCall(MatSetOperation(Ae,MATOP_NORM,normfunc));
1001       PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A),"MATH2OPUS construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n",(double)n1,(double)ni,(double)n2,(double)(n1/n1A),(double)(ni/niA),(double)(n2/n2A)));
1002       PetscCall(MatDestroy(&E));
1003     }
1004     a->sampler->SetSamplingMat(NULL);
1005   }
1006   PetscFunctionReturn(0);
1007 }
1008 
1009 static PetscErrorCode MatZeroEntries_H2OPUS(Mat A)
1010 {
1011   PetscMPIInt    size;
1012   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1013 
1014   PetscFunctionBegin;
1015   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1016   PetscCheck(size <= 1,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not yet supported");
1017   a->hmatrix->clearData();
1018 #if defined(PETSC_H2OPUS_USE_GPU)
1019   if (a->hmatrix_gpu) a->hmatrix_gpu->clearData();
1020 #endif
1021   PetscFunctionReturn(0);
1022 }
1023 
1024 static PetscErrorCode MatDuplicate_H2OPUS(Mat B, MatDuplicateOption op, Mat *nA)
1025 {
1026   Mat            A;
1027   Mat_H2OPUS     *a, *b = (Mat_H2OPUS*)B->data;
1028 #if defined(PETSC_H2OPUS_USE_GPU)
1029   PetscBool      iscpu = PETSC_FALSE;
1030 #else
1031   PetscBool      iscpu = PETSC_TRUE;
1032 #endif
1033   MPI_Comm       comm;
1034 
1035   PetscFunctionBegin;
1036   PetscCall(PetscObjectGetComm((PetscObject)B,&comm));
1037   PetscCall(MatCreate(comm,&A));
1038   PetscCall(MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N));
1039   PetscCall(MatSetType(A,MATH2OPUS));
1040   PetscCall(MatPropagateSymmetryOptions(B,A));
1041   a = (Mat_H2OPUS*)A->data;
1042 
1043   a->eta              = b->eta;
1044   a->leafsize         = b->leafsize;
1045   a->basisord         = b->basisord;
1046   a->max_rank         = b->max_rank;
1047   a->bs               = b->bs;
1048   a->rtol             = b->rtol;
1049   a->norm_max_samples = b->norm_max_samples;
1050   if (op == MAT_COPY_VALUES) a->s = b->s;
1051 
1052   a->ptcloud = new PetscPointCloud<PetscReal>(*b->ptcloud);
1053   if (op == MAT_COPY_VALUES && b->kernel) a->kernel = new PetscFunctionGenerator<PetscScalar>(*b->kernel);
1054 
1055 #if defined(H2OPUS_USE_MPI)
1056   if (b->dist_hmatrix) { a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix); }
1057 #if defined(PETSC_H2OPUS_USE_GPU)
1058   if (b->dist_hmatrix_gpu) { a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu); }
1059 #endif
1060 #endif
1061   if (b->hmatrix) {
1062     a->hmatrix = new HMatrix(*b->hmatrix);
1063     if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData();
1064   }
1065 #if defined(PETSC_H2OPUS_USE_GPU)
1066   if (b->hmatrix_gpu) {
1067     a->hmatrix_gpu = new HMatrix_GPU(*b->hmatrix_gpu);
1068     if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData();
1069   }
1070 #endif
1071   if (b->sf) {
1072     PetscCall(PetscObjectReference((PetscObject)b->sf));
1073     a->sf = b->sf;
1074   }
1075   if (b->h2opus_indexmap) {
1076     PetscCall(PetscObjectReference((PetscObject)b->h2opus_indexmap));
1077     a->h2opus_indexmap = b->h2opus_indexmap;
1078   }
1079 
1080   PetscCall(MatSetUp(A));
1081   PetscCall(MatSetUpMultiply_H2OPUS(A));
1082   if (op == MAT_COPY_VALUES) {
1083     A->assembled = PETSC_TRUE;
1084     a->orthogonal = b->orthogonal;
1085 #if defined(PETSC_H2OPUS_USE_GPU)
1086     A->offloadmask = B->offloadmask;
1087 #endif
1088   }
1089 #if defined(PETSC_H2OPUS_USE_GPU)
1090   iscpu = B->boundtocpu;
1091 #endif
1092   PetscCall(MatBindToCPU(A,iscpu));
1093 
1094   *nA = A;
1095   PetscFunctionReturn(0);
1096 }
1097 
1098 static PetscErrorCode MatView_H2OPUS(Mat A, PetscViewer view)
1099 {
1100   Mat_H2OPUS        *h2opus = (Mat_H2OPUS*)A->data;
1101   PetscBool         isascii, vieweps;
1102   PetscMPIInt       size;
1103   PetscViewerFormat format;
1104 
1105   PetscFunctionBegin;
1106   PetscCall(PetscObjectTypeCompare((PetscObject)view,PETSCVIEWERASCII,&isascii));
1107   PetscCall(PetscViewerGetFormat(view,&format));
1108   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1109   if (isascii) {
1110     if (format == PETSC_VIEWER_ASCII_MATLAB) {
1111       if (size == 1) {
1112         FILE *fp;
1113         PetscCall(PetscViewerASCIIGetPointer(view,&fp));
1114         dumpHMatrix(*h2opus->hmatrix,6,fp);
1115       }
1116     } else {
1117       PetscCall(PetscViewerASCIIPrintf(view,"  H-Matrix constructed from %s\n",h2opus->kernel ? "Kernel" : "Mat"));
1118       PetscCall(PetscViewerASCIIPrintf(view,"  PointCloud dim %" PetscInt_FMT "\n",h2opus->ptcloud ? h2opus->ptcloud->getDimension() : 0));
1119       PetscCall(PetscViewerASCIIPrintf(view,"  Admissibility parameters: leaf size %" PetscInt_FMT ", eta %g\n",h2opus->leafsize,(double)h2opus->eta));
1120       if (!h2opus->kernel) {
1121         PetscCall(PetscViewerASCIIPrintf(view,"  Sampling parameters: max_rank %" PetscInt_FMT ", samples %" PetscInt_FMT ", tolerance %g\n",h2opus->max_rank,h2opus->bs,(double)h2opus->rtol));
1122       } else {
1123         PetscCall(PetscViewerASCIIPrintf(view,"  Offdiagonal blocks approximation order %" PetscInt_FMT "\n",h2opus->basisord));
1124       }
1125       PetscCall(PetscViewerASCIIPrintf(view,"  Number of samples for norms %" PetscInt_FMT "\n",h2opus->norm_max_samples));
1126       if (size == 1) {
1127         double dense_mem_cpu = h2opus->hmatrix ? h2opus->hmatrix->getDenseMemoryUsage() : 0;
1128         double low_rank_cpu = h2opus->hmatrix ? h2opus->hmatrix->getLowRankMemoryUsage() : 0;
1129 #if defined(PETSC_HAVE_CUDA)
1130         double dense_mem_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getDenseMemoryUsage() : 0;
1131         double low_rank_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getLowRankMemoryUsage() : 0;
1132 #endif
1133         PetscCall(PetscViewerASCIIPrintf(view,"  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu));
1134 #if defined(PETSC_HAVE_CUDA)
1135         PetscCall(PetscViewerASCIIPrintf(view,"  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu));
1136 #endif
1137       } else {
1138 #if defined(PETSC_HAVE_CUDA)
1139         double      matrix_mem[4] = {0.,0.,0.,0.};
1140         PetscMPIInt rsize = 4;
1141 #else
1142         double      matrix_mem[2] = {0.,0.};
1143         PetscMPIInt rsize = 2;
1144 #endif
1145 #if defined(H2OPUS_USE_MPI)
1146         matrix_mem[0] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalDenseMemoryUsage() : 0;
1147         matrix_mem[1] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalLowRankMemoryUsage() : 0;
1148 #if defined(PETSC_HAVE_CUDA)
1149         matrix_mem[2] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalDenseMemoryUsage() : 0;
1150         matrix_mem[3] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalLowRankMemoryUsage() : 0;
1151 #endif
1152 #endif
1153         PetscCall(MPIU_Allreduce(MPI_IN_PLACE,matrix_mem,rsize,MPI_DOUBLE_PRECISION,MPI_SUM,PetscObjectComm((PetscObject)A)));
1154         PetscCall(PetscViewerASCIIPrintf(view,"  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[0], matrix_mem[1], matrix_mem[0] + matrix_mem[1]));
1155 #if defined(PETSC_HAVE_CUDA)
1156         PetscCall(PetscViewerASCIIPrintf(view,"  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[2], matrix_mem[3], matrix_mem[2] + matrix_mem[3]));
1157 #endif
1158       }
1159     }
1160   }
1161   vieweps = PETSC_FALSE;
1162   PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_vieweps",&vieweps,NULL));
1163   if (vieweps) {
1164     char filename[256];
1165     const char *name;
1166 
1167     PetscCall(PetscObjectGetName((PetscObject)A,&name));
1168     PetscCall(PetscSNPrintf(filename,sizeof(filename),"%s_structure.eps",name));
1169     PetscCall(PetscOptionsGetString(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_vieweps_filename",filename,sizeof(filename),NULL));
1170     outputEps(*h2opus->hmatrix,filename);
1171   }
1172   PetscFunctionReturn(0);
1173 }
1174 
1175 static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat A, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx)
1176 {
1177   Mat_H2OPUS     *h2opus = (Mat_H2OPUS*)A->data;
1178   PetscReal      *gcoords;
1179   PetscInt       N;
1180   MPI_Comm       comm;
1181   PetscMPIInt    size;
1182   PetscBool      cong;
1183 
1184   PetscFunctionBegin;
1185   PetscCall(PetscLayoutSetUp(A->rmap));
1186   PetscCall(PetscLayoutSetUp(A->cmap));
1187   PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
1188   PetscCall(MatHasCongruentLayouts(A,&cong));
1189   PetscCheck(cong,comm,PETSC_ERR_SUP,"Only for square matrices with congruent layouts");
1190   N = A->rmap->N;
1191   PetscCallMPI(MPI_Comm_size(comm,&size));
1192   if (spacedim > 0 && size > 1 && cdist) {
1193     PetscSF      sf;
1194     MPI_Datatype dtype;
1195 
1196     PetscCallMPI(MPI_Type_contiguous(spacedim,MPIU_REAL,&dtype));
1197     PetscCallMPI(MPI_Type_commit(&dtype));
1198 
1199     PetscCall(PetscSFCreate(comm,&sf));
1200     PetscCall(PetscSFSetGraphWithPattern(sf,A->rmap,PETSCSF_PATTERN_ALLGATHER));
1201     PetscCall(PetscMalloc1(spacedim*N,&gcoords));
1202     PetscCall(PetscSFBcastBegin(sf,dtype,coords,gcoords,MPI_REPLACE));
1203     PetscCall(PetscSFBcastEnd(sf,dtype,coords,gcoords,MPI_REPLACE));
1204     PetscCall(PetscSFDestroy(&sf));
1205     PetscCallMPI(MPI_Type_free(&dtype));
1206   } else gcoords = (PetscReal*)coords;
1207 
1208   delete h2opus->ptcloud;
1209   delete h2opus->kernel;
1210   h2opus->ptcloud = new PetscPointCloud<PetscReal>(spacedim,N,gcoords);
1211   if (kernel) h2opus->kernel = new PetscFunctionGenerator<PetscScalar>(kernel,spacedim,kernelctx);
1212   if (gcoords != coords) PetscCall(PetscFree(gcoords));
1213   A->preallocated = PETSC_TRUE;
1214   PetscFunctionReturn(0);
1215 }
1216 
1217 #if defined(PETSC_H2OPUS_USE_GPU)
1218 static PetscErrorCode MatBindToCPU_H2OPUS(Mat A, PetscBool flg)
1219 {
1220   PetscMPIInt    size;
1221   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1222 
1223   PetscFunctionBegin;
1224   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1225   if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) {
1226     if (size > 1) {
1227       PetscCheck(a->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1228 #if defined(H2OPUS_USE_MPI)
1229       if (!a->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix_gpu);
1230       else *a->dist_hmatrix = *a->dist_hmatrix_gpu;
1231 #endif
1232     } else {
1233       PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1234       if (!a->hmatrix) a->hmatrix = new HMatrix(*a->hmatrix_gpu);
1235       else *a->hmatrix = *a->hmatrix_gpu;
1236     }
1237     delete a->hmatrix_gpu;
1238     delete a->dist_hmatrix_gpu;
1239     a->hmatrix_gpu = NULL;
1240     a->dist_hmatrix_gpu = NULL;
1241     A->offloadmask = PETSC_OFFLOAD_CPU;
1242   } else if (!flg && A->offloadmask == PETSC_OFFLOAD_CPU) {
1243     if (size > 1) {
1244       PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1245 #if defined(H2OPUS_USE_MPI)
1246       if (!a->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
1247       else *a->dist_hmatrix_gpu = *a->dist_hmatrix;
1248 #endif
1249     } else {
1250       PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1251       if (!a->hmatrix_gpu) a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
1252       else *a->hmatrix_gpu = *a->hmatrix;
1253     }
1254     delete a->hmatrix;
1255     delete a->dist_hmatrix;
1256     a->hmatrix = NULL;
1257     a->dist_hmatrix = NULL;
1258     A->offloadmask = PETSC_OFFLOAD_GPU;
1259   }
1260   PetscCall(PetscFree(A->defaultvectype));
1261   if (!flg) {
1262     PetscCall(PetscStrallocpy(VECCUDA,&A->defaultvectype));
1263   } else {
1264     PetscCall(PetscStrallocpy(VECSTANDARD,&A->defaultvectype));
1265   }
1266   A->boundtocpu = flg;
1267   PetscFunctionReturn(0);
1268 }
1269 #endif
1270 
1271 /*MC
1272      MATH2OPUS = "h2opus" - A matrix type for hierarchical matrices using the H2Opus package.
1273 
1274    Options Database Keys:
1275 .     -mat_type h2opus - matrix type to "h2opus" during a call to MatSetFromOptions()
1276 
1277    Notes:
1278      H2Opus implements hierarchical matrices in the H^2 flavour.
1279      It supports CPU or NVIDIA GPUs.
1280      For CPU only builds, use ./configure --download-h2opus --download-thrust to install PETSc to use H2Opus.
1281      In order to run on NVIDIA GPUs, use ./configure --download-h2opus --download-magma --download-kblas.
1282      For details and additional references, see
1283        "H2Opus: A distributed-memory multi-GPU software package for non-local operators",
1284      available at https://arxiv.org/abs/2109.05451.
1285 
1286    Level: beginner
1287 
1288 .seealso: `MATHTOOL`, `MATDENSE`, `MatCreateH2OpusFromKernel()`, `MatCreateH2OpusFromMat()`
1289 M*/
1290 PETSC_EXTERN PetscErrorCode MatCreate_H2OPUS(Mat A)
1291 {
1292   Mat_H2OPUS     *a;
1293   PetscMPIInt    size;
1294 
1295   PetscFunctionBegin;
1296 #if defined(PETSC_H2OPUS_USE_GPU)
1297   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
1298 #endif
1299   PetscCall(PetscNewLog(A,&a));
1300   A->data = (void*)a;
1301 
1302   a->eta              = 0.9;
1303   a->leafsize         = 32;
1304   a->basisord         = 4;
1305   a->max_rank         = 64;
1306   a->bs               = 32;
1307   a->rtol             = 1.e-4;
1308   a->s                = 1.0;
1309   a->norm_max_samples = 10;
1310   a->resize           = PETSC_TRUE; /* reallocate after compression */
1311 #if defined(H2OPUS_USE_MPI)
1312   h2opusCreateDistributedHandleComm(&a->handle,PetscObjectComm((PetscObject)A));
1313 #else
1314   h2opusCreateHandle(&a->handle);
1315 #endif
1316   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1317   PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATH2OPUS));
1318   PetscCall(PetscMemzero(A->ops,sizeof(struct _MatOps)));
1319 
1320   A->ops->destroy          = MatDestroy_H2OPUS;
1321   A->ops->view             = MatView_H2OPUS;
1322   A->ops->assemblyend      = MatAssemblyEnd_H2OPUS;
1323   A->ops->mult             = MatMult_H2OPUS;
1324   A->ops->multtranspose    = MatMultTranspose_H2OPUS;
1325   A->ops->multadd          = MatMultAdd_H2OPUS;
1326   A->ops->multtransposeadd = MatMultTransposeAdd_H2OPUS;
1327   A->ops->scale            = MatScale_H2OPUS;
1328   A->ops->duplicate        = MatDuplicate_H2OPUS;
1329   A->ops->setfromoptions   = MatSetFromOptions_H2OPUS;
1330   A->ops->norm             = MatNorm_H2OPUS;
1331   A->ops->zeroentries      = MatZeroEntries_H2OPUS;
1332 #if defined(PETSC_H2OPUS_USE_GPU)
1333   A->ops->bindtocpu        = MatBindToCPU_H2OPUS;
1334 #endif
1335 
1336   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdense_C",MatProductSetFromOptions_H2OPUS));
1337   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdensecuda_C",MatProductSetFromOptions_H2OPUS));
1338   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidense_C",MatProductSetFromOptions_H2OPUS));
1339   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidensecuda_C",MatProductSetFromOptions_H2OPUS));
1340 #if defined(PETSC_H2OPUS_USE_GPU)
1341   PetscCall(PetscFree(A->defaultvectype));
1342   PetscCall(PetscStrallocpy(VECCUDA,&A->defaultvectype));
1343 #endif
1344   PetscFunctionReturn(0);
1345 }
1346 
1347 /*@C
1348      MatH2OpusOrthogonalize - Orthogonalize the basis tree of a hierarchical matrix.
1349 
1350    Input Parameter:
1351 .     A - the matrix
1352 
1353    Level: intermediate
1354 
1355 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`
1356 @*/
1357 PetscErrorCode MatH2OpusOrthogonalize(Mat A)
1358 {
1359   PetscBool      ish2opus;
1360   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1361   PetscMPIInt    size;
1362   PetscBool      boundtocpu = PETSC_TRUE;
1363 
1364   PetscFunctionBegin;
1365   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
1366   PetscValidType(A,1);
1367   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
1368   if (!ish2opus) PetscFunctionReturn(0);
1369   if (a->orthogonal) PetscFunctionReturn(0);
1370   HLibProfile::clear();
1371   PetscCall(PetscLogEventBegin(MAT_H2Opus_Orthog,A,0,0,0));
1372 #if defined(PETSC_H2OPUS_USE_GPU)
1373   boundtocpu = A->boundtocpu;
1374 #endif
1375   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1376   if (size > 1) {
1377     if (boundtocpu) {
1378       PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1379 #if defined(H2OPUS_USE_MPI)
1380       distributed_horthog(*a->dist_hmatrix, a->handle);
1381 #endif
1382 #if defined(PETSC_H2OPUS_USE_GPU)
1383       A->offloadmask = PETSC_OFFLOAD_CPU;
1384     } else {
1385       PetscCheck(a->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1386       PetscCall(PetscLogGpuTimeBegin());
1387 #if defined(H2OPUS_USE_MPI)
1388       distributed_horthog(*a->dist_hmatrix_gpu, a->handle);
1389 #endif
1390       PetscCall(PetscLogGpuTimeEnd());
1391 #endif
1392     }
1393   } else {
1394 #if defined(H2OPUS_USE_MPI)
1395     h2opusHandle_t handle = a->handle->handle;
1396 #else
1397     h2opusHandle_t handle = a->handle;
1398 #endif
1399     if (boundtocpu) {
1400       PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1401       horthog(*a->hmatrix, handle);
1402 #if defined(PETSC_H2OPUS_USE_GPU)
1403       A->offloadmask = PETSC_OFFLOAD_CPU;
1404     } else {
1405       PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1406       PetscCall(PetscLogGpuTimeBegin());
1407       horthog(*a->hmatrix_gpu, handle);
1408       PetscCall(PetscLogGpuTimeEnd());
1409 #endif
1410     }
1411   }
1412   a->orthogonal = PETSC_TRUE;
1413   { /* log flops */
1414     double gops,time,perf,dev;
1415     HLibProfile::getHorthogPerf(gops,time,perf,dev);
1416 #if defined(PETSC_H2OPUS_USE_GPU)
1417     if (boundtocpu) {
1418       PetscCall(PetscLogFlops(1e9*gops));
1419     } else {
1420       PetscCall(PetscLogGpuFlops(1e9*gops));
1421     }
1422 #else
1423     PetscCall(PetscLogFlops(1e9*gops));
1424 #endif
1425   }
1426   PetscCall(PetscLogEventEnd(MAT_H2Opus_Orthog,A,0,0,0));
1427   PetscFunctionReturn(0);
1428 }
1429 
1430 /*@C
1431      MatH2OpusCompress - Compress a hierarchical matrix.
1432 
1433    Input Parameters:
1434 +     A - the matrix
1435 -     tol - the absolute truncation threshold
1436 
1437    Level: intermediate
1438 
1439 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusOrthogonalize()`
1440 @*/
1441 PetscErrorCode MatH2OpusCompress(Mat A, PetscReal tol)
1442 {
1443   PetscBool      ish2opus;
1444   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1445   PetscMPIInt    size;
1446   PetscBool      boundtocpu = PETSC_TRUE;
1447 
1448   PetscFunctionBegin;
1449   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
1450   PetscValidType(A,1);
1451   PetscValidLogicalCollectiveReal(A,tol,2);
1452   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
1453   if (!ish2opus || tol <= 0.0) PetscFunctionReturn(0);
1454   PetscCall(MatH2OpusOrthogonalize(A));
1455   HLibProfile::clear();
1456   PetscCall(PetscLogEventBegin(MAT_H2Opus_Compress,A,0,0,0));
1457 #if defined(PETSC_H2OPUS_USE_GPU)
1458   boundtocpu = A->boundtocpu;
1459 #endif
1460   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1461   if (size > 1) {
1462     if (boundtocpu) {
1463       PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1464 #if defined(H2OPUS_USE_MPI)
1465       distributed_hcompress(*a->dist_hmatrix, tol, a->handle);
1466       if (a->resize) {
1467         DistributedHMatrix *dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix);
1468         delete a->dist_hmatrix;
1469         a->dist_hmatrix = dist_hmatrix;
1470       }
1471 #endif
1472 #if defined(PETSC_H2OPUS_USE_GPU)
1473       A->offloadmask = PETSC_OFFLOAD_CPU;
1474     } else {
1475       PetscCheck(a->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1476       PetscCall(PetscLogGpuTimeBegin());
1477 #if defined(H2OPUS_USE_MPI)
1478       distributed_hcompress(*a->dist_hmatrix_gpu, tol, a->handle);
1479 
1480       if (a->resize) {
1481         DistributedHMatrix_GPU *dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix_gpu);
1482         delete a->dist_hmatrix_gpu;
1483         a->dist_hmatrix_gpu = dist_hmatrix_gpu;
1484       }
1485 #endif
1486       PetscCall(PetscLogGpuTimeEnd());
1487 #endif
1488     }
1489   } else {
1490 #if defined(H2OPUS_USE_MPI)
1491     h2opusHandle_t handle = a->handle->handle;
1492 #else
1493     h2opusHandle_t handle = a->handle;
1494 #endif
1495     if (boundtocpu) {
1496       PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1497       hcompress(*a->hmatrix, tol, handle);
1498 
1499       if (a->resize) {
1500         HMatrix *hmatrix = new HMatrix(*a->hmatrix);
1501         delete a->hmatrix;
1502         a->hmatrix = hmatrix;
1503       }
1504 #if defined(PETSC_H2OPUS_USE_GPU)
1505       A->offloadmask = PETSC_OFFLOAD_CPU;
1506     } else {
1507       PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1508       PetscCall(PetscLogGpuTimeBegin());
1509       hcompress(*a->hmatrix_gpu, tol, handle);
1510       PetscCall(PetscLogGpuTimeEnd());
1511 
1512       if (a->resize) {
1513         HMatrix_GPU *hmatrix_gpu = new HMatrix_GPU(*a->hmatrix_gpu);
1514         delete a->hmatrix_gpu;
1515         a->hmatrix_gpu = hmatrix_gpu;
1516       }
1517 #endif
1518     }
1519   }
1520   { /* log flops */
1521     double gops,time,perf,dev;
1522     HLibProfile::getHcompressPerf(gops,time,perf,dev);
1523 #if defined(PETSC_H2OPUS_USE_GPU)
1524     if (boundtocpu) {
1525       PetscCall(PetscLogFlops(1e9*gops));
1526     } else {
1527       PetscCall(PetscLogGpuFlops(1e9*gops));
1528     }
1529 #else
1530     PetscCall(PetscLogFlops(1e9*gops));
1531 #endif
1532   }
1533   PetscCall(PetscLogEventEnd(MAT_H2Opus_Compress,A,0,0,0));
1534   PetscFunctionReturn(0);
1535 }
1536 
1537 /*@C
1538      MatH2OpusSetSamplingMat - Set a matrix to be sampled from matrix vector product to construct a hierarchical matrix.
1539 
1540    Input Parameters:
1541 +     A - the hierarchical matrix
1542 .     B - the matrix to be sampled
1543 .     bs - maximum number of samples to be taken concurrently
1544 -     tol - relative tolerance for construction
1545 
1546    Notes: Need to call MatAssemblyBegin/End() to update the hierarchical matrix.
1547 
1548    Level: intermediate
1549 
1550 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`
1551 @*/
1552 PetscErrorCode MatH2OpusSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol)
1553 {
1554   PetscBool      ish2opus;
1555 
1556   PetscFunctionBegin;
1557   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
1558   PetscValidType(A,1);
1559   if (B) PetscValidHeaderSpecific(B,MAT_CLASSID,2);
1560   PetscValidLogicalCollectiveInt(A,bs,3);
1561   PetscValidLogicalCollectiveReal(A,tol,3);
1562   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
1563   if (ish2opus) {
1564     Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
1565 
1566     if (!a->sampler) a->sampler = new PetscMatrixSampler();
1567     a->sampler->SetSamplingMat(B);
1568     if (bs > 0) a->bs = bs;
1569     if (tol > 0.) a->rtol = tol;
1570     delete a->kernel;
1571   }
1572   PetscFunctionReturn(0);
1573 }
1574 
1575 /*@C
1576      MatCreateH2OpusFromKernel - Creates a MATH2OPUS from a user-supplied kernel.
1577 
1578    Input Parameters:
1579 +     comm - MPI communicator
1580 .     m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1581 .     n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
1582 .     M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
1583 .     N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
1584 .     spacedim - dimension of the space coordinates
1585 .     coords - coordinates of the points
1586 .     cdist - whether or not coordinates are distributed
1587 .     kernel - computational kernel (or NULL)
1588 .     kernelctx - kernel context
1589 .     eta - admissibility condition tolerance
1590 .     leafsize - leaf size in cluster tree
1591 -     basisord - approximation order for Chebychev interpolation of low-rank blocks
1592 
1593    Output Parameter:
1594 .     nA - matrix
1595 
1596    Options Database Keys:
1597 +     -mat_h2opus_leafsize <PetscInt> - Leaf size of cluster tree
1598 .     -mat_h2opus_eta <PetscReal> - Admissibility condition tolerance
1599 .     -mat_h2opus_order <PetscInt> - Chebychev approximation order
1600 -     -mat_h2opus_normsamples <PetscInt> - Maximum number of samples to be used when estimating norms
1601 
1602    Level: intermediate
1603 
1604 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`
1605 @*/
1606 PetscErrorCode MatCreateH2OpusFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat* nA)
1607 {
1608   Mat            A;
1609   Mat_H2OPUS     *h2opus;
1610 #if defined(PETSC_H2OPUS_USE_GPU)
1611   PetscBool      iscpu = PETSC_FALSE;
1612 #else
1613   PetscBool      iscpu = PETSC_TRUE;
1614 #endif
1615 
1616   PetscFunctionBegin;
1617   PetscCheck(m == n,PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported");
1618   PetscCall(MatCreate(comm,&A));
1619   PetscCall(MatSetSizes(A,m,n,M,N));
1620   PetscCheck(M == N,comm,PETSC_ERR_SUP,"Rectangular matrices are not supported");
1621   PetscCall(MatSetType(A,MATH2OPUS));
1622   PetscCall(MatBindToCPU(A,iscpu));
1623   PetscCall(MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,cdist,kernel,kernelctx));
1624 
1625   h2opus = (Mat_H2OPUS*)A->data;
1626   if (eta > 0.) h2opus->eta = eta;
1627   if (leafsize > 0) h2opus->leafsize = leafsize;
1628   if (basisord > 0) h2opus->basisord = basisord;
1629 
1630   *nA = A;
1631   PetscFunctionReturn(0);
1632 }
1633 
1634 /*@C
1635      MatCreateH2OpusFromMat - Creates a MATH2OPUS sampling from a user-supplied operator.
1636 
1637    Input Parameters:
1638 +     B - the matrix to be sampled
1639 .     spacedim - dimension of the space coordinates
1640 .     coords - coordinates of the points
1641 .     cdist - whether or not coordinates are distributed
1642 .     eta - admissibility condition tolerance
1643 .     leafsize - leaf size in cluster tree
1644 .     maxrank - maximum rank allowed
1645 .     bs - maximum number of samples to be taken concurrently
1646 -     rtol - relative tolerance for construction
1647 
1648    Output Parameter:
1649 .     nA - matrix
1650 
1651    Options Database Keys:
1652 +     -mat_h2opus_leafsize <PetscInt> - Leaf size of cluster tree
1653 .     -mat_h2opus_eta <PetscReal> - Admissibility condition tolerance
1654 .     -mat_h2opus_maxrank <PetscInt> - Maximum rank when constructed from matvecs
1655 .     -mat_h2opus_samples <PetscInt> - Maximum number of samples to be taken concurrently when constructing from matvecs
1656 .     -mat_h2opus_rtol <PetscReal> - Relative tolerance for construction from sampling
1657 .     -mat_h2opus_check <PetscBool> - Check error when constructing from sampling during MatAssemblyEnd()
1658 .     -mat_h2opus_hara_verbose <PetscBool> - Verbose output from hara construction
1659 -     -mat_h2opus_normsamples <PetscInt> - Maximum bumber of samples to be when estimating norms
1660 
1661    Notes: not available in parallel
1662 
1663    Level: intermediate
1664 
1665 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromKernel()`
1666 @*/
1667 PetscErrorCode MatCreateH2OpusFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA)
1668 {
1669   Mat            A;
1670   Mat_H2OPUS     *h2opus;
1671   MPI_Comm       comm;
1672   PetscBool      boundtocpu = PETSC_TRUE;
1673 
1674   PetscFunctionBegin;
1675   PetscValidHeaderSpecific(B,MAT_CLASSID,1);
1676   PetscValidLogicalCollectiveInt(B,spacedim,2);
1677   PetscValidLogicalCollectiveBool(B,cdist,4);
1678   PetscValidLogicalCollectiveReal(B,eta,5);
1679   PetscValidLogicalCollectiveInt(B,leafsize,6);
1680   PetscValidLogicalCollectiveInt(B,maxrank,7);
1681   PetscValidLogicalCollectiveInt(B,bs,8);
1682   PetscValidLogicalCollectiveReal(B,rtol,9);
1683   PetscValidPointer(nA,10);
1684   PetscCall(PetscObjectGetComm((PetscObject)B,&comm));
1685   PetscCheck(B->rmap->n == B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported");
1686   PetscCheck(B->rmap->N == B->cmap->N,comm,PETSC_ERR_SUP,"Rectangular matrices are not supported");
1687   PetscCall(MatCreate(comm,&A));
1688   PetscCall(MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N));
1689 #if defined(PETSC_H2OPUS_USE_GPU)
1690   {
1691     PetscBool iscuda;
1692     VecType   vtype;
1693 
1694     PetscCall(MatGetVecType(B,&vtype));
1695     PetscCall(PetscStrcmp(vtype,VECCUDA,&iscuda));
1696     if (!iscuda) {
1697       PetscCall(PetscStrcmp(vtype,VECSEQCUDA,&iscuda));
1698       if (!iscuda) {
1699         PetscCall(PetscStrcmp(vtype,VECMPICUDA,&iscuda));
1700       }
1701     }
1702     if (iscuda && !B->boundtocpu) boundtocpu = PETSC_FALSE;
1703   }
1704 #endif
1705   PetscCall(MatSetType(A,MATH2OPUS));
1706   PetscCall(MatBindToCPU(A,boundtocpu));
1707   if (spacedim) {
1708     PetscCall(MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,cdist,NULL,NULL));
1709   }
1710   PetscCall(MatPropagateSymmetryOptions(B,A));
1711   /* PetscCheck(A->symmetric,comm,PETSC_ERR_SUP,"Unsymmetric sampling does not work"); */
1712 
1713   h2opus = (Mat_H2OPUS*)A->data;
1714   h2opus->sampler = new PetscMatrixSampler(B);
1715   if (eta > 0.) h2opus->eta = eta;
1716   if (leafsize > 0) h2opus->leafsize = leafsize;
1717   if (maxrank > 0) h2opus->max_rank = maxrank;
1718   if (bs > 0) h2opus->bs = bs;
1719   if (rtol > 0.) h2opus->rtol = rtol;
1720   *nA = A;
1721   A->preallocated = PETSC_TRUE;
1722   PetscFunctionReturn(0);
1723 }
1724 
1725 /*@C
1726      MatH2OpusGetIndexMap - Access reordering index set.
1727 
1728    Input Parameters:
1729 .     A - the matrix
1730 
1731    Output Parameter:
1732 .     indexmap - the index set for the reordering
1733 
1734    Level: intermediate
1735 
1736 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1737 @*/
1738 PetscErrorCode MatH2OpusGetIndexMap(Mat A, IS *indexmap)
1739 {
1740   PetscBool      ish2opus;
1741   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1742 
1743   PetscFunctionBegin;
1744   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
1745   PetscValidType(A,1);
1746   PetscValidPointer(indexmap,2);
1747   PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1748   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
1749   PetscCheck(ish2opus,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name);
1750   *indexmap = a->h2opus_indexmap;
1751   PetscFunctionReturn(0);
1752 }
1753 
1754 /*@C
1755      MatH2OpusMapVec - Maps a vector between PETSc and H2Opus ordering
1756 
1757    Input Parameters:
1758 +     A - the matrix
1759 .     nativetopetsc - if true, maps from H2Opus ordering to PETSc ordering. If false, applies the reverse map
1760 -     in - the vector to be mapped
1761 
1762    Output Parameter:
1763 .     out - the newly created mapped vector
1764 
1765    Level: intermediate
1766 
1767 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`
1768 @*/
1769 PetscErrorCode MatH2OpusMapVec(Mat A, PetscBool nativetopetsc, Vec in, Vec* out)
1770 {
1771   PetscBool      ish2opus;
1772   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1773   PetscScalar    *xin,*xout;
1774   PetscBool      nm;
1775 
1776   PetscFunctionBegin;
1777   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
1778   PetscValidType(A,1);
1779   PetscValidLogicalCollectiveBool(A,nativetopetsc,2);
1780   PetscValidHeaderSpecific(in,VEC_CLASSID,3);
1781   PetscValidPointer(out,4);
1782   PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1783   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
1784   PetscCheck(ish2opus,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name);
1785   nm   = a->nativemult;
1786   PetscCall(MatH2OpusSetNativeMult(A,(PetscBool)!nativetopetsc));
1787   PetscCall(MatCreateVecs(A,out,NULL));
1788   PetscCall(MatH2OpusSetNativeMult(A,nm));
1789   if (!a->sf) { /* same ordering */
1790     PetscCall(VecCopy(in,*out));
1791     PetscFunctionReturn(0);
1792   }
1793   PetscCall(VecGetArrayRead(in,(const PetscScalar**)&xin));
1794   PetscCall(VecGetArrayWrite(*out,&xout));
1795   if (nativetopetsc) {
1796     PetscCall(PetscSFReduceBegin(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE));
1797     PetscCall(PetscSFReduceEnd(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE));
1798   } else {
1799     PetscCall(PetscSFBcastBegin(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE));
1800     PetscCall(PetscSFBcastEnd(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE));
1801   }
1802   PetscCall(VecRestoreArrayRead(in,(const PetscScalar**)&xin));
1803   PetscCall(VecRestoreArrayWrite(*out,&xout));
1804   PetscFunctionReturn(0);
1805 }
1806 
1807 /*@C
1808      MatH2OpusLowRankUpdate - Perform a low-rank update of the form A = A + s * U * V^T
1809 
1810    Input Parameters:
1811 +     A - the hierarchical matrix
1812 .     s - the scaling factor
1813 .     U - the dense low-rank update matrix
1814 -     V - (optional) the dense low-rank update matrix (if NULL, then V = U is assumed)
1815 
1816    Notes: The U and V matrices must be in dense format
1817 
1818    Level: intermediate
1819 
1820 .seealso: `MatCreate()`, `MATH2OPUS`, `MatCreateH2OpusFromMat()`, `MatCreateH2OpusFromKernel()`, `MatH2OpusCompress()`, `MatH2OpusOrthogonalize()`, `MATDENSE`
1821 @*/
1822 PetscErrorCode MatH2OpusLowRankUpdate(Mat A, Mat U, Mat V, PetscScalar s)
1823 {
1824   PetscBool      flg;
1825 
1826   PetscFunctionBegin;
1827   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
1828   PetscValidType(A,1);
1829   PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1830   PetscValidHeaderSpecific(U,MAT_CLASSID,2);
1831   PetscCheckSameComm(A,1,U,2);
1832   if (V) {
1833     PetscValidHeaderSpecific(V,MAT_CLASSID,3);
1834     PetscCheckSameComm(A,1,V,3);
1835   }
1836   PetscValidLogicalCollectiveScalar(A,s,4);
1837 
1838   if (!V) V = U;
1839   PetscCheck(U->cmap->N == V->cmap->N,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Non matching rank update %" PetscInt_FMT " != %" PetscInt_FMT,U->cmap->N,V->cmap->N);
1840   if (!U->cmap->N) PetscFunctionReturn(0);
1841   PetscCall(PetscLayoutCompare(U->rmap,A->rmap,&flg));
1842   PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"A and U must have the same row layout");
1843   PetscCall(PetscLayoutCompare(V->rmap,A->cmap,&flg));
1844   PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"A column layout must match V row column layout");
1845   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&flg));
1846   if (flg) {
1847     Mat_H2OPUS        *a = (Mat_H2OPUS*)A->data;
1848     const PetscScalar *u,*v,*uu,*vv;
1849     PetscInt          ldu,ldv;
1850     PetscMPIInt       size;
1851 #if defined(H2OPUS_USE_MPI)
1852     h2opusHandle_t    handle = a->handle->handle;
1853 #else
1854     h2opusHandle_t    handle = a->handle;
1855 #endif
1856     PetscBool         usesf = (PetscBool)(a->sf && !a->nativemult);
1857     PetscSF           usf,vsf;
1858 
1859     PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1860     PetscCheck(size <= 1,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not yet implemented in parallel");
1861     PetscCall(PetscLogEventBegin(MAT_H2Opus_LR,A,0,0,0));
1862     PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)U,&flg,MATSEQDENSE,MATMPIDENSE,""));
1863     PetscCheck(flg,PetscObjectComm((PetscObject)U),PETSC_ERR_SUP,"Not for U of type %s",((PetscObject)U)->type_name);
1864     PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)V,&flg,MATSEQDENSE,MATMPIDENSE,""));
1865     PetscCheck(flg,PetscObjectComm((PetscObject)V),PETSC_ERR_SUP,"Not for V of type %s",((PetscObject)V)->type_name);
1866     PetscCall(MatDenseGetLDA(U,&ldu));
1867     PetscCall(MatDenseGetLDA(V,&ldv));
1868     PetscCall(MatBoundToCPU(A,&flg));
1869     if (usesf) {
1870       PetscInt n;
1871 
1872       PetscCall(MatDenseGetH2OpusVectorSF(U,a->sf,&usf));
1873       PetscCall(MatDenseGetH2OpusVectorSF(V,a->sf,&vsf));
1874       PetscCall(MatH2OpusResizeBuffers_Private(A,U->cmap->N,V->cmap->N));
1875       PetscCall(PetscSFGetGraph(a->sf,NULL,&n,NULL,NULL));
1876       ldu = n;
1877       ldv = n;
1878     }
1879     if (flg) {
1880       PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1881       PetscCall(MatDenseGetArrayRead(U,&u));
1882       PetscCall(MatDenseGetArrayRead(V,&v));
1883       if (usesf) {
1884         vv = MatH2OpusGetThrustPointer(*a->yy);
1885         PetscCall(PetscSFBcastBegin(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE));
1886         PetscCall(PetscSFBcastEnd(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE));
1887         if (U != V) {
1888           uu = MatH2OpusGetThrustPointer(*a->xx);
1889           PetscCall(PetscSFBcastBegin(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE));
1890           PetscCall(PetscSFBcastEnd(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE));
1891         } else uu = vv;
1892       } else { uu = u; vv = v; }
1893       hlru_global(*a->hmatrix,uu,ldu,vv,ldv,U->cmap->N,s,handle);
1894       PetscCall(MatDenseRestoreArrayRead(U,&u));
1895       PetscCall(MatDenseRestoreArrayRead(V,&v));
1896     } else {
1897 #if defined(PETSC_H2OPUS_USE_GPU)
1898       PetscBool flgU, flgV;
1899 
1900       PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1901       PetscCall(PetscObjectTypeCompareAny((PetscObject)U,&flgU,MATSEQDENSE,MATMPIDENSE,""));
1902       if (flgU) PetscCall(MatConvert(U,MATDENSECUDA,MAT_INPLACE_MATRIX,&U));
1903       PetscCall(PetscObjectTypeCompareAny((PetscObject)V,&flgV,MATSEQDENSE,MATMPIDENSE,""));
1904       if (flgV) PetscCall(MatConvert(V,MATDENSECUDA,MAT_INPLACE_MATRIX,&V));
1905       PetscCall(MatDenseCUDAGetArrayRead(U,&u));
1906       PetscCall(MatDenseCUDAGetArrayRead(V,&v));
1907       if (usesf) {
1908         vv = MatH2OpusGetThrustPointer(*a->yy_gpu);
1909         PetscCall(PetscSFBcastBegin(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE));
1910         PetscCall(PetscSFBcastEnd(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE));
1911         if (U != V) {
1912           uu = MatH2OpusGetThrustPointer(*a->xx_gpu);
1913           PetscCall(PetscSFBcastBegin(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE));
1914           PetscCall(PetscSFBcastEnd(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE));
1915         } else uu = vv;
1916       } else { uu = u; vv = v; }
1917 #else
1918       SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"This should not happen");
1919 #endif
1920       hlru_global(*a->hmatrix_gpu,uu,ldu,vv,ldv,U->cmap->N,s,handle);
1921 #if defined(PETSC_H2OPUS_USE_GPU)
1922       PetscCall(MatDenseCUDARestoreArrayRead(U,&u));
1923       PetscCall(MatDenseCUDARestoreArrayRead(V,&v));
1924       if (flgU) PetscCall(MatConvert(U,MATDENSE,MAT_INPLACE_MATRIX,&U));
1925       if (flgV) PetscCall(MatConvert(V,MATDENSE,MAT_INPLACE_MATRIX,&V));
1926 #endif
1927     }
1928     PetscCall(PetscLogEventEnd(MAT_H2Opus_LR,A,0,0,0));
1929     a->orthogonal = PETSC_FALSE;
1930   }
1931   PetscFunctionReturn(0);
1932 }
1933 #endif
1934