xref: /petsc/src/mat/impls/h2opus/cuda/math2opus.cu (revision c3b5f7ba6bc5ce25a01a67bb37ba5d34b02bbbd7)
1 #include <h2opusconf.h>
2 /* skip compilation of this .cu file if H2OPUS is CPU only while PETSc has GPU support */
3 #if !defined(__CUDACC__) || defined(H2OPUS_USE_GPU)
4 #include <h2opus.h>
5 #if defined(H2OPUS_USE_MPI)
6 #include <h2opus/distributed/distributed_h2opus_handle.h>
7 #include <h2opus/distributed/distributed_geometric_construction.h>
8 #include <h2opus/distributed/distributed_hgemv.h>
9 #include <h2opus/distributed/distributed_horthog.h>
10 #include <h2opus/distributed/distributed_hcompress.h>
11 #endif
12 #include <h2opus/util/boxentrygen.h>
13 #include <petsc/private/matimpl.h>
14 #include <petsc/private/vecimpl.h>
15 #include <petsc/private/deviceimpl.h>
16 #include <petscsf.h>
17 
18 /* math2opusutils */
19 PETSC_INTERN PetscErrorCode PetscSFGetVectorSF(PetscSF,PetscInt,PetscInt,PetscInt,PetscSF*);
20 PETSC_INTERN PetscErrorCode MatDenseGetH2OpusVectorSF(Mat,PetscSF,PetscSF*);
21 PETSC_INTERN PetscErrorCode VecSign(Vec,Vec);
22 PETSC_INTERN PetscErrorCode VecSetDelta(Vec,PetscInt);
23 PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat,NormType,PetscInt,PetscReal*);
24 
25 #define MatH2OpusGetThrustPointer(v) thrust::raw_pointer_cast((v).data())
26 
27 /* Use GPU only if H2OPUS is configured for GPU */
28 #if defined(PETSC_HAVE_CUDA) && defined(H2OPUS_USE_GPU)
29 #define PETSC_H2OPUS_USE_GPU
30 #endif
31 #if defined(PETSC_H2OPUS_USE_GPU)
32 #define MatH2OpusUpdateIfNeeded(A,B) MatBindToCPU(A,(PetscBool)((A)->boundtocpu || (B)))
33 #else
34 #define MatH2OpusUpdateIfNeeded(A,B) 0
35 #endif
36 
37 // TODO H2OPUS:
38 // DistributedHMatrix
39 //   unsymmetric ?
40 //   transpose for distributed_hgemv?
41 //   clearData()
42 // Unify interface for sequential and parallel?
43 // Reuse geometric construction (almost possible, only the unsymmetric case is explicitly handled)
44 //
/* Point cloud adaptor exposing PETSc-provided coordinates to H2OPUS.
   Coordinates are stored interleaved: pts[n*dimension + d] is coordinate d of point n.
   When no coordinates are supplied, distinct equispaced points on [0,1] are
   generated (replicated over all spatial dimensions). */
template <class T> class PetscPointCloud : public H2OpusDataSet<T>
{
  private:
    int dimension;      /* spatial dimension, always >= 1 */
    size_t num_points;  /* number of points in the cloud */
    std::vector<T> pts; /* interleaved coordinates, size num_points*dimension */

  public:
    /* Build the cloud from user coordinates (coords may be NULL, see above) */
    PetscPointCloud(int dim, size_t num_pts, const T coords[])
    {
      dim = dim > 0 ? dim : 1; /* guard against non-positive dimension */
      this->dimension = dim;
      this->num_points = num_pts;

      pts.resize(num_pts*dim);
      if (coords) {
        for (size_t n = 0; n < num_points; n++)
          for (int i = 0; i < dim; i++)
            pts[n*dim + i] = coords[n*dim + i];
      } else {
        /* guard the division: a single (or empty) cloud collapses to the origin */
        PetscReal h = num_points > 1 ? 1./(num_points - 1) : 0.;
        for (size_t n = 0; n < num_points; n++)
          for (int i = 0; i < dim; i++)
            pts[n*dim + i] = n*h; /* index by point n (not dimension i) so points are distinct */
      }
    }

    /* Deep-copy constructor */
    PetscPointCloud(const PetscPointCloud<T>& other)
    {
      size_t N = other.dimension * other.num_points;
      this->dimension = other.dimension;
      this->num_points = other.num_points;
      this->pts.resize(N);
      for (size_t i = 0; i < N; i++)
        this->pts[i] = other.pts[i];
    }

    int getDimension() const
    {
        return dimension;
    }

    size_t getDataSetSize() const
    {
        return num_points;
    }

    /* Return coordinate dim of point idx */
    T getDataPoint(size_t idx, int dim) const
    {
        assert(dim < dimension && idx < num_points);
        return pts[idx*dimension + dim];
    }

    /* Debug dump of the cloud, one point per line */
    void Print(std::ostream& out = std::cout)
    {
       out << "Dimension: " << dimension << std::endl;
       out << "NumPoints: " << num_points << std::endl;
       for (size_t n = 0; n < num_points; n++) {
         for (int d = 0; d < dimension; d++)
           out << pts[n*dimension + d] << " ";
         out << std::endl;
       }
    }
};
109 
/* Adaptor that lets H2OPUS evaluate matrix entries through a user kernel
   callback of type MatH2OpusKernel: entry = k(dim, pt1, pt2, ctx). */
template<class T> class PetscFunctionGenerator
{
private:
  MatH2OpusKernel k;    /* user callback producing one matrix entry */
  int             dim;  /* spatial dimension forwarded to the callback */
  void            *ctx; /* opaque user context forwarded verbatim */

public:
    PetscFunctionGenerator(MatH2OpusKernel k, int dim, void* ctx) { this->k = k; this->dim = dim; this->ctx = ctx; }
    /* const-correct copy constructor: also accepts const sources (the original
       non-const reference needlessly rejected them) */
    PetscFunctionGenerator(const PetscFunctionGenerator& other) { this->k = other.k; this->dim = other.dim; this->ctx = other.ctx; }
    /* Evaluate the kernel at the pair of points (pt1,pt2) */
    T operator()(PetscReal *pt1, PetscReal *pt2)
    {
        return (T)((*this->k)(this->dim,pt1,pt2,this->ctx));
    }
};
125 
126 #include <../src/mat/impls/h2opus/math2opussampler.hpp>
127 
128 /* just to not clutter the code */
129 #if !defined(H2OPUS_USE_GPU)
130 typedef HMatrix HMatrix_GPU;
131 #if defined(H2OPUS_USE_MPI)
132 typedef DistributedHMatrix DistributedHMatrix_GPU;
133 #endif
134 #endif
135 
/* Private data of a MATH2OPUS matrix. Holds both the sequential (HMatrix) and
   distributed (DistributedHMatrix) representations, on CPU and (optionally) GPU,
   plus permutation/scatter data used to map PETSc orderings to H2OPUS orderings. */
typedef struct {
#if defined(H2OPUS_USE_MPI)
  distributedH2OpusHandle_t handle; /* H2OPUS library handle (MPI build) */
#else
  h2opusHandle_t handle; /* H2OPUS library handle (sequential build) */
#endif
  /* Sequential and parallel matrices are two different classes at the moment */
  HMatrix *hmatrix; /* sequential CPU matrix (size == 1 runs) */
#if defined(H2OPUS_USE_MPI)
  DistributedHMatrix *dist_hmatrix; /* distributed CPU matrix (size > 1 runs) */
#else
  HMatrix *dist_hmatrix; /* just to not clutter the code */
#endif
  /* May use permutations */
  PetscSF sf;                             /* scatters PETSc-ordered vectors to/from H2OPUS ordering */
  PetscLayout h2opus_rmap, h2opus_cmap;   /* H2OPUS-native layouts, swapped in/out by MatH2OpusSetNativeMult */
  IS h2opus_indexmap;                     /* H2OPUS index permutation as an IS */
  thrust::host_vector<PetscScalar> *xx,*yy; /* host scratch buffers for permuted mults */
  PetscInt xxs,yys;                       /* number of columns the scratch buffers can currently hold */
  PetscBool multsetup;                    /* MatSetUpMultiply_H2OPUS already ran? */

  /* GPU */
  HMatrix_GPU *hmatrix_gpu; /* sequential GPU matrix */
#if defined(H2OPUS_USE_MPI)
  DistributedHMatrix_GPU *dist_hmatrix_gpu; /* distributed GPU matrix */
#else
  HMatrix_GPU *dist_hmatrix_gpu; /* just to not clutter the code */
#endif
#if defined(PETSC_H2OPUS_USE_GPU)
  thrust::device_vector<PetscScalar> *xx_gpu,*yy_gpu; /* device scratch buffers for permuted mults */
  PetscInt xxs_gpu,yys_gpu;                           /* columns the device buffers can currently hold */
#endif

  /* construction from matvecs */
  PetscMatrixSampler* sampler;  /* wraps the matrix being approximated via sampling */
  PetscBool nativemult;         /* if true, vectors are already in H2OPUS ordering (skip sf) */

  /* Admissibility */
  PetscReal eta;      /* admissibility condition tolerance */
  PetscInt  leafsize; /* leaf size of the cluster tree */

  /* for dof reordering */
  PetscPointCloud<PetscReal> *ptcloud;

  /* kernel for generating matrix entries */
  PetscFunctionGenerator<PetscScalar> *kernel;

  /* basis orthogonalized? */
  PetscBool orthogonal;

  /* customization */
  PetscInt  basisord;           /* basis order for off-diagonal sampling (kernel construction) */
  PetscInt  max_rank;           /* maximum rank for construction from matvecs */
  PetscInt  bs;                 /* number of concurrent samples for construction from matvecs */
  PetscReal rtol;               /* relative tolerance for construction from sampling */
  PetscInt  norm_max_samples;   /* samples used when estimating norms */
  PetscBool check_construction; /* verify sampling-based construction in MatAssemblyEnd() */
  PetscBool hara_verbose;       /* verbose output from hara construction */

  /* keeps track of MatScale values */
  PetscScalar s;
} Mat_H2OPUS;
198 
/* Destroy a MATH2OPUS matrix: release the H2OPUS handle, all CPU/GPU matrix
   representations, permutation data, scratch buffers, and composed methods. */
static PetscErrorCode MatDestroy_H2OPUS(Mat A)
{
  Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;

  PetscFunctionBegin;
  /* handle destruction depends on the build (MPI vs sequential) */
#if defined(H2OPUS_USE_MPI)
  h2opusDestroyDistributedHandle(a->handle);
#else
  h2opusDestroyHandle(a->handle);
#endif
  delete a->dist_hmatrix;
  delete a->hmatrix;
  PetscCall(PetscSFDestroy(&a->sf));
  PetscCall(PetscLayoutDestroy(&a->h2opus_rmap));
  PetscCall(PetscLayoutDestroy(&a->h2opus_cmap));
  PetscCall(ISDestroy(&a->h2opus_indexmap));
  delete a->xx;
  delete a->yy;
  delete a->hmatrix_gpu;
  delete a->dist_hmatrix_gpu;
#if defined(PETSC_H2OPUS_USE_GPU)
  delete a->xx_gpu;
  delete a->yy_gpu;
#endif
  delete a->sampler;
  delete a->ptcloud;
  delete a->kernel;
  /* remove the MatProduct hooks composed at creation time */
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdense_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdensecuda_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidense_C",NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidensecuda_C",NULL));
  PetscCall(PetscObjectChangeTypeName((PetscObject)A,NULL));
  PetscCall(PetscFree(A->data));
  PetscFunctionReturn(0);
}
234 
/* Toggle "native" multiplication: when enabled, MatMult operates on vectors
   already permuted/distributed in the H2OPUS ordering, bypassing the PetscSF
   scatters. The PETSc layouts are swapped so vector creation matches. */
PetscErrorCode MatH2OpusSetNativeMult(Mat A, PetscBool nm)
{
  Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
  PetscBool  ish2opus;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidLogicalCollectiveBool(A,nm,2);
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
  if (!ish2opus) PetscFunctionReturn(0); /* silently ignore other types */
  /* swap layouts only when the flag actually changes and native layouts exist */
  if (a->h2opus_rmap && ((!a->nativemult && nm) || (a->nativemult && !nm))) {
    PetscLayout tmp;

    tmp            = A->rmap;
    A->rmap        = a->h2opus_rmap;
    a->h2opus_rmap = tmp;
    tmp            = A->cmap;
    A->cmap        = a->h2opus_cmap;
    a->h2opus_cmap = tmp;
  }
  a->nativemult = nm;
  PetscFunctionReturn(0);
}
260 
/* Query whether "native" (H2OPUS-ordered) multiplication is enabled.
   Errors for any type other than MATH2OPUS. */
PetscErrorCode MatH2OpusGetNativeMult(Mat A, PetscBool *nm)
{
  Mat_H2OPUS *h2opus = (Mat_H2OPUS*)A->data;
  PetscBool  ish2opus;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(nm,2);
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
  PetscCheck(ish2opus,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name);
  *nm = h2opus->nativemult;
  PetscFunctionReturn(0);
}
274 
/* Approximate a matrix norm by sampling. For MATH2OPUS matrices, native
   multiplication is temporarily enabled as a fast path, since norm estimates
   are independent of the ordering of the degrees of freedom. */
PETSC_EXTERN PetscErrorCode MatNorm_H2OPUS(Mat A, NormType normtype, PetscReal* n)
{
  PetscBool  ish2opus;
  PetscInt   nmax = PETSC_DECIDE;
  Mat_H2OPUS *a = NULL;
  PetscBool  mult = PETSC_FALSE;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
  if (!ish2opus) {
    /* generic path: sample count comes from the options database */
    PetscCall(PetscOptionsGetInt(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_approximate_norm_samples",&nmax,NULL));
  } else {
    /* use the user-defined number of samples, and remember the mult mode to restore it */
    a    = (Mat_H2OPUS*)A->data;
    nmax = a->norm_max_samples;
    mult = a->nativemult;
    PetscCall(MatH2OpusSetNativeMult(A,PETSC_TRUE));
  }
  PetscCall(MatApproximateNorm_Private(A,normtype,nmax,n));
  if (a) PetscCall(MatH2OpusSetNativeMult(A,mult)); /* restore caller's setting */
  PetscFunctionReturn(0);
}
297 
/* Ensure the permutation scratch buffers (host or device, depending on where
   the matrix is bound) can hold xN input and yN output columns. Buffers only
   ever grow; the cached column counts xxs/yys track current capacity. */
static PetscErrorCode MatH2OpusResizeBuffers_Private(Mat A, PetscInt xN, PetscInt yN)
{
  Mat_H2OPUS *h2opus = (Mat_H2OPUS*)A->data;
  PetscInt   n;
  PetscBool  boundtocpu = PETSC_TRUE;

  PetscFunctionBegin;
#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
#endif
  PetscCall(PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL));
  if (boundtocpu) {
    if (h2opus->xxs < xN) {
      h2opus->xx->resize(n*xN);
      h2opus->xxs = xN;
    }
    if (h2opus->yys < yN) {
      h2opus->yy->resize(n*yN);
      h2opus->yys = yN;
    }
  }
#if defined(PETSC_H2OPUS_USE_GPU)
  if (!boundtocpu) {
    if (h2opus->xxs_gpu < xN) {
      h2opus->xx_gpu->resize(n*xN);
      h2opus->xxs_gpu = xN;
    }
    if (h2opus->yys_gpu < yN) {
      h2opus->yy_gpu->resize(n*yN);
      h2opus->yys_gpu = yN;
    }
  }
#endif
  PetscFunctionReturn(0);
}
321 
/* Compute C = op(A)*B (op = identity or transpose) with B,C dense matrices.
   Handles four combinations: CPU vs GPU execution (A->boundtocpu) and
   sequential vs distributed storage (communicator size). When the H2OPUS
   ordering differs from PETSc's, columns are scattered through per-matrix
   PetscSFs into scratch buffers before calling (distributed_)hgemv. */
static PetscErrorCode MatMultNKernel_H2OPUS(Mat A, PetscBool transA, Mat B, Mat C)
{
  Mat_H2OPUS     *h2opus = (Mat_H2OPUS*)A->data;
#if defined(H2OPUS_USE_MPI)
  h2opusHandle_t handle = h2opus->handle->handle;
#else
  h2opusHandle_t handle = h2opus->handle;
#endif
  PetscBool      boundtocpu = PETSC_TRUE;
  PetscScalar    *xx,*yy,*uxx,*uyy; /* raw arrays and their (possibly permuted) counterparts */
  PetscInt       blda,clda;
  PetscMPIInt    size;
  PetscSF        bsf,csf;
  /* permutation needed only if an SF exists and native (pre-permuted) mult is off */
  PetscBool      usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);

  PetscFunctionBegin;
  HLibProfile::clear(); /* reset profiling counters so flop logging below sees only this call */
#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
#endif
  PetscCall(MatDenseGetLDA(B,&blda));
  PetscCall(MatDenseGetLDA(C,&clda));
  if (usesf) {
    PetscInt n;

    /* multi-column SFs derived from the vector SF */
    PetscCall(MatDenseGetH2OpusVectorSF(B,h2opus->sf,&bsf));
    PetscCall(MatDenseGetH2OpusVectorSF(C,h2opus->sf,&csf));

    PetscCall(MatH2OpusResizeBuffers_Private(A,B->cmap->N,C->cmap->N));
    PetscCall(PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL));
    /* the scratch buffers are packed with leading dimension n */
    blda = n;
    clda = n;
  }
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
  if (boundtocpu) {
    PetscCall(MatDenseGetArrayRead(B,(const PetscScalar**)&xx));
    PetscCall(MatDenseGetArrayWrite(C,&yy));
    if (usesf) {
      /* scatter B's columns into H2OPUS ordering */
      uxx  = MatH2OpusGetThrustPointer(*h2opus->xx);
      uyy  = MatH2OpusGetThrustPointer(*h2opus->yy);
      PetscCall(PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
    } else {
      uxx = xx;
      uyy = yy;
    }
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
      /* transpose is only applied via symmetry in parallel (see commented arg below) */
      PetscCheckFalse(transA && !A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
#if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
#endif
    } else {
      PetscCheck(h2opus->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
      hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
    }
    PetscCall(MatDenseRestoreArrayRead(B,(const PetscScalar**)&xx));
    if (usesf) {
      /* scatter the result back to PETSc ordering */
      PetscCall(PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
    }
    PetscCall(MatDenseRestoreArrayWrite(C,&yy));
#if defined(PETSC_H2OPUS_USE_GPU)
  } else {
    PetscBool ciscuda,biscuda;

    /* If not of type seqdensecuda, convert on the fly (i.e. allocate GPU memory) */
    PetscCall(PetscObjectTypeCompareAny((PetscObject)B,&biscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,""));
    if (!biscuda) {
      PetscCall(MatConvert(B,MATDENSECUDA,MAT_INPLACE_MATRIX,&B));
    }
    PetscCall(PetscObjectTypeCompareAny((PetscObject)C,&ciscuda,MATSEQDENSECUDA,MATMPIDENSECUDA,""));
    if (!ciscuda) {
      C->assembled = PETSC_TRUE; /* allow in-place conversion of the (output) matrix */
      PetscCall(MatConvert(C,MATDENSECUDA,MAT_INPLACE_MATRIX,&C));
    }
    PetscCall(MatDenseCUDAGetArrayRead(B,(const PetscScalar**)&xx));
    PetscCall(MatDenseCUDAGetArrayWrite(C,&yy));
    if (usesf) {
      /* device scratch buffers; the SF communicates GPU-resident data */
      uxx  = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
      uyy  = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);
      PetscCall(PetscSFBcastBegin(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(bsf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
    } else {
      uxx = xx;
      uyy = yy;
    }
    PetscCall(PetscLogGpuTimeBegin());
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed GPU matrix");
      PetscCheckFalse(transA && !A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
#if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/* transA ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, h2opus->handle);
#endif
    } else {
      PetscCheck(h2opus->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
      hgemv(transA ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, blda, 0.0, uyy, clda, B->cmap->N, handle);
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(MatDenseCUDARestoreArrayRead(B,(const PetscScalar**)&xx));
    if (usesf) {
      PetscCall(PetscSFReduceBegin(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(csf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
    }
    PetscCall(MatDenseCUDARestoreArrayWrite(C,&yy));
    /* undo the on-the-fly conversions so the caller gets back the original types */
    if (!biscuda) {
      PetscCall(MatConvert(B,MATDENSE,MAT_INPLACE_MATRIX,&B));
    }
    if (!ciscuda) {
      PetscCall(MatConvert(C,MATDENSE,MAT_INPLACE_MATRIX,&C));
    }
#endif
  }
  { /* log flops */
    double gops,time,perf,dev;
    HLibProfile::getHgemvPerf(gops,time,perf,dev); /* gops reported in GFlops by H2OPUS */
#if defined(PETSC_H2OPUS_USE_GPU)
    if (boundtocpu) {
      PetscCall(PetscLogFlops(1e9*gops));
    } else {
      PetscCall(PetscLogGpuFlops(1e9*gops));
    }
#else
    PetscCall(PetscLogFlops(1e9*gops));
#endif
  }
  PetscFunctionReturn(0);
}
450 
/* Numeric phase of MatProduct for MATH2OPUS: dispatch AB / AtB to the
   dense-block multiplication kernel. */
static PetscErrorCode MatProductNumeric_H2OPUS(Mat C)
{
  Mat_Product *product = C->product;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (product->type == MATPRODUCT_AB) {
    PetscCall(MatMultNKernel_H2OPUS(product->A,PETSC_FALSE,product->B,C));
  } else if (product->type == MATPRODUCT_AtB) {
    PetscCall(MatMultNKernel_H2OPUS(product->A,PETSC_TRUE,product->B,C));
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]);
  PetscFunctionReturn(0);
}
469 
/* Symbolic phase of MatProduct for MATH2OPUS: size C and give it a dense type
   compatible with B, then install the numeric phase.
   The AB and AtB branches previously duplicated the whole setup sequence; they
   differ only in which map of A provides C's row sizes, so that choice is
   factored out and the tail is shared. */
static PetscErrorCode MatProductSymbolic_H2OPUS(Mat C)
{
  Mat_Product    *product = C->product;
  PetscBool      cisdense;
  Mat            A,B;
  PetscInt       rn,rN;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  A = product->A;
  B = product->B;
  switch (product->type) {
  case MATPRODUCT_AB:  /* C = A*B   -> rows of C follow rows of A */
    rn = A->rmap->n;
    rN = A->rmap->N;
    break;
  case MATPRODUCT_AtB: /* C = A^T*B -> rows of C follow columns of A */
    rn = A->cmap->n;
    rN = A->cmap->N;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatProduct type %s is not supported",MatProductTypes[product->type]);
  }
  PetscCall(MatSetSizes(C,rn,B->cmap->n,rN,B->cmap->N));
  PetscCall(MatSetBlockSizesFromMats(C,product->A,product->B));
  /* keep C's type if it is already some dense flavor, otherwise inherit B's */
  PetscCall(PetscObjectTypeCompareAny((PetscObject)C,&cisdense,MATSEQDENSE,MATMPIDENSE,MATSEQDENSECUDA,MATMPIDENSECUDA,""));
  if (!cisdense) PetscCall(MatSetType(C,((PetscObject)product->B)->type_name));
  PetscCall(MatSetUp(C));
  C->ops->productsymbolic = NULL; /* symbolic phase runs once */
  C->ops->productnumeric = MatProductNumeric_H2OPUS;
  PetscFunctionReturn(0);
}
502 
/* Enable MatProduct for the supported product types (AB and AtB only) */
static PetscErrorCode MatProductSetFromOptions_H2OPUS(Mat C)
{
  PetscFunctionBegin;
  MatCheckProduct(C,1);
  switch (C->product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_AtB:
    C->ops->productsymbolic = MatProductSymbolic_H2OPUS;
    break;
  default: /* unsupported types are reported later, at symbolic/numeric time */
    break;
  }
  PetscFunctionReturn(0);
}
512 
/* Compute y = s*op(A)*x + sy*y for a single vector x (op = identity or
   transpose). Mirrors MatMultNKernel_H2OPUS for the Vec case: four paths
   (CPU/GPU x sequential/distributed), with optional permutation of x (and of
   y, when sy != 0, since hgemv accumulates into the permuted buffer) through
   the PetscSF. */
static PetscErrorCode MatMultKernel_H2OPUS(Mat A, Vec x, PetscScalar sy, Vec y, PetscBool trans)
{
  Mat_H2OPUS     *h2opus = (Mat_H2OPUS*)A->data;
#if defined(H2OPUS_USE_MPI)
  h2opusHandle_t handle = h2opus->handle->handle;
#else
  h2opusHandle_t handle = h2opus->handle;
#endif
  PetscBool      boundtocpu = PETSC_TRUE;
  PetscInt       n;
  PetscScalar    *xx,*yy,*uxx,*uyy; /* raw arrays and their (possibly permuted) counterparts */
  PetscMPIInt    size;
  /* permutation needed only if an SF exists and native (pre-permuted) mult is off */
  PetscBool      usesf = (PetscBool)(h2opus->sf && !h2opus->nativemult);

  PetscFunctionBegin;
  HLibProfile::clear(); /* reset profiling counters so flop logging below sees only this call */
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
#endif
  if (usesf) {
    PetscCall(PetscSFGetGraph(h2opus->sf,NULL,&n,NULL,NULL));
  } else n = A->rmap->n;
  if (boundtocpu) {
    PetscCall(VecGetArrayRead(x,(const PetscScalar**)&xx));
    if (sy == 0.0) {
      PetscCall(VecGetArrayWrite(y,&yy)); /* y is overwritten; no need to read it */
    } else {
      PetscCall(VecGetArray(y,&yy));
    }
    if (usesf) {
      uxx = MatH2OpusGetThrustPointer(*h2opus->xx);
      uyy = MatH2OpusGetThrustPointer(*h2opus->yy);

      /* scatter x into H2OPUS ordering */
      PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
      if (sy != 0.0) {
        /* hgemv accumulates into uyy, so y must be scattered too */
        PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE));
        PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE));
      }
    } else {
      uxx = xx;
      uyy = yy;
    }
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
      /* transpose is only applied via symmetry in parallel (see commented arg below) */
      PetscCheckFalse(trans && !A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
#if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix, uxx, n, sy, uyy, n, 1, h2opus->handle);
#endif
    } else {
      PetscCheck(h2opus->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
      hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix, uxx, n, sy, uyy, n, 1, handle);
    }
    PetscCall(VecRestoreArrayRead(x,(const PetscScalar**)&xx));
    if (usesf) {
      /* scatter the result back to PETSc ordering */
      PetscCall(PetscSFReduceBegin(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
    }
    if (sy == 0.0) {
      PetscCall(VecRestoreArrayWrite(y,&yy));
    } else {
      PetscCall(VecRestoreArray(y,&yy));
    }
#if defined(PETSC_H2OPUS_USE_GPU)
  } else {
    PetscCall(VecCUDAGetArrayRead(x,(const PetscScalar**)&xx));
    if (sy == 0.0) {
      PetscCall(VecCUDAGetArrayWrite(y,&yy)); /* y is overwritten; no need to read it */
    } else {
      PetscCall(VecCUDAGetArray(y,&yy));
    }
    if (usesf) {
      /* device scratch buffers; the SF communicates GPU-resident data */
      uxx = MatH2OpusGetThrustPointer(*h2opus->xx_gpu);
      uyy = MatH2OpusGetThrustPointer(*h2opus->yy_gpu);

      PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
      PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,xx,uxx,MPI_REPLACE));
      if (sy != 0.0) {
        PetscCall(PetscSFBcastBegin(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE));
        PetscCall(PetscSFBcastEnd(h2opus->sf,MPIU_SCALAR,yy,uyy,MPI_REPLACE));
      }
    } else {
      uxx = xx;
      uyy = yy;
    }
    PetscCall(PetscLogGpuTimeBegin());
    if (size > 1) {
      PetscCheck(h2opus->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed GPU matrix");
      PetscCheckFalse(trans && !A->symmetric,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"MatMultTranspose not yet coded in parallel");
#if defined(H2OPUS_USE_MPI)
      distributed_hgemv(/*trans ? H2Opus_Trans : H2Opus_NoTrans, */h2opus->s, *h2opus->dist_hmatrix_gpu, uxx, n, sy, uyy, n, 1, h2opus->handle);
#endif
    } else {
      PetscCheck(h2opus->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
      hgemv(trans ? H2Opus_Trans : H2Opus_NoTrans, h2opus->s, *h2opus->hmatrix_gpu, uxx, n, sy, uyy, n, 1, handle);
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(VecCUDARestoreArrayRead(x,(const PetscScalar**)&xx));
    if (usesf) {
      PetscCall(PetscSFReduceBegin(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
      PetscCall(PetscSFReduceEnd(h2opus->sf,MPIU_SCALAR,uyy,yy,MPI_REPLACE));
    }
    if (sy == 0.0) {
      PetscCall(VecCUDARestoreArrayWrite(y,&yy));
    } else {
      PetscCall(VecCUDARestoreArray(y,&yy));
    }
#endif
  }
  { /* log flops */
    double gops,time,perf,dev;
    HLibProfile::getHgemvPerf(gops,time,perf,dev); /* gops reported in GFlops by H2OPUS */
#if defined(PETSC_H2OPUS_USE_GPU)
    if (boundtocpu) {
      PetscCall(PetscLogFlops(1e9*gops));
    } else {
      PetscCall(PetscLogGpuFlops(1e9*gops));
    }
#else
    PetscCall(PetscLogFlops(1e9*gops));
#endif
  }
  PetscFunctionReturn(0);
}
638 
/* y = A^T * x. Binds the matrix to the CPU first when either vector is
   host-resident. */
static PetscErrorCode MatMultTranspose_H2OPUS(Mat A, Vec x, Vec y)
{
  PetscBool cudax,cuday;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&cudax,VECSEQCUDA,VECMPICUDA,""));
  PetscCall(PetscObjectTypeCompareAny((PetscObject)y,&cuday,VECSEQCUDA,VECMPICUDA,""));
  PetscCall(MatH2OpusUpdateIfNeeded(A,!cudax || !cuday));
  PetscCall(MatMultKernel_H2OPUS(A,x,0.0,y,PETSC_TRUE));
  PetscFunctionReturn(0);
}
650 
/* y = A * x. Binds the matrix to the CPU first when either vector is
   host-resident. */
static PetscErrorCode MatMult_H2OPUS(Mat A, Vec x, Vec y)
{
  PetscBool cudax,cuday;

  PetscFunctionBegin;
  PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&cudax,VECSEQCUDA,VECMPICUDA,""));
  PetscCall(PetscObjectTypeCompareAny((PetscObject)y,&cuday,VECSEQCUDA,VECMPICUDA,""));
  PetscCall(MatH2OpusUpdateIfNeeded(A,!cudax || !cuday));
  PetscCall(MatMultKernel_H2OPUS(A,x,0.0,y,PETSC_FALSE));
  PetscFunctionReturn(0);
}
662 
/* z = y + A^T * x, implemented as z := y followed by z += A^T * x */
static PetscErrorCode MatMultTransposeAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
{
  PetscBool cudax,cudaz;

  PetscFunctionBegin;
  PetscCall(VecCopy(y,z)); /* seed the accumulation with y */
  PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&cudax,VECSEQCUDA,VECMPICUDA,""));
  PetscCall(PetscObjectTypeCompareAny((PetscObject)z,&cudaz,VECSEQCUDA,VECMPICUDA,""));
  PetscCall(MatH2OpusUpdateIfNeeded(A,!cudax || !cudaz));
  PetscCall(MatMultKernel_H2OPUS(A,x,1.0,z,PETSC_TRUE));
  PetscFunctionReturn(0);
}
675 
/* z = y + A * x, implemented as z := y followed by z += A * x */
static PetscErrorCode MatMultAdd_H2OPUS(Mat A, Vec x, Vec y, Vec z)
{
  PetscBool cudax,cudaz;

  PetscFunctionBegin;
  PetscCall(VecCopy(y,z)); /* seed the accumulation with y */
  PetscCall(PetscObjectTypeCompareAny((PetscObject)x,&cudax,VECSEQCUDA,VECMPICUDA,""));
  PetscCall(PetscObjectTypeCompareAny((PetscObject)z,&cudaz,VECSEQCUDA,VECMPICUDA,""));
  PetscCall(MatH2OpusUpdateIfNeeded(A,!cudax || !cudaz));
  PetscCall(MatMultKernel_H2OPUS(A,x,1.0,z,PETSC_FALSE));
  PetscFunctionReturn(0);
}
688 
/* Lazy scaling: fold the factor into the cached scalar, which is applied
   at hgemv time during multiplications. */
static PetscErrorCode MatScale_H2OPUS(Mat A, PetscScalar s)
{
  Mat_H2OPUS *h2opus = (Mat_H2OPUS*)A->data;

  PetscFunctionBegin;
  h2opus->s = h2opus->s * s;
  PetscFunctionReturn(0);
}
697 
/* Process the -mat_h2opus_* options controlling cluster tree construction,
   admissibility, and the sampling-based (matvec) construction path.
   Fixes the garbled help text of -mat_h2opus_normsamples ("Maximum bumber of
   samples to be when estimating norms"). */
static PetscErrorCode MatSetFromOptions_H2OPUS(PetscOptionItems *PetscOptionsObject,Mat A)
{
  Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;

  PetscFunctionBegin;
  PetscCall(PetscOptionsHead(PetscOptionsObject,"H2OPUS options"));
  PetscCall(PetscOptionsInt("-mat_h2opus_leafsize","Leaf size of cluster tree",NULL,a->leafsize,&a->leafsize,NULL));
  PetscCall(PetscOptionsReal("-mat_h2opus_eta","Admissibility condition tolerance",NULL,a->eta,&a->eta,NULL));
  PetscCall(PetscOptionsInt("-mat_h2opus_order","Basis order for off-diagonal sampling when constructed from kernel",NULL,a->basisord,&a->basisord,NULL));
  PetscCall(PetscOptionsInt("-mat_h2opus_maxrank","Maximum rank when constructed from matvecs",NULL,a->max_rank,&a->max_rank,NULL));
  PetscCall(PetscOptionsInt("-mat_h2opus_samples","Maximum number of samples to be taken concurrently when constructing from matvecs",NULL,a->bs,&a->bs,NULL));
  PetscCall(PetscOptionsInt("-mat_h2opus_normsamples","Maximum number of samples to be taken when estimating norms",NULL,a->norm_max_samples,&a->norm_max_samples,NULL));
  PetscCall(PetscOptionsReal("-mat_h2opus_rtol","Relative tolerance for construction from sampling",NULL,a->rtol,&a->rtol,NULL));
  PetscCall(PetscOptionsBool("-mat_h2opus_check","Check error when constructing from sampling during MatAssemblyEnd()",NULL,a->check_construction,&a->check_construction,NULL));
  PetscCall(PetscOptionsBool("-mat_h2opus_hara_verbose","Verbose output from hara construction",NULL,a->hara_verbose,&a->hara_verbose,NULL));
  PetscCall(PetscOptionsTail());
  PetscFunctionReturn(0);
}
716 
717 static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat,PetscInt,const PetscReal[],PetscBool,MatH2OpusKernel,void*);
718 
/* Infer the coordinates used for geometric clustering when none were set:
   look for a "__math2opus_coords" Vec composed on A, then on the sampling
   matrix; fall back to default (synthetic) coordinates otherwise. */
static PetscErrorCode MatH2OpusInferCoordinates_Private(Mat A)
{
  Mat_H2OPUS        *a = (Mat_H2OPUS*)A->data;
  Vec               c;
  PetscInt          spacedim;
  const PetscScalar *coords;

  PetscFunctionBegin;
  if (a->ptcloud) PetscFunctionReturn(0); /* already inferred */
  PetscCall(PetscObjectQuery((PetscObject)A,"__math2opus_coords",(PetscObject*)&c));
  if (!c && a->sampler) {
    /* fall back to coordinates attached to the matrix being sampled */
    Mat S = a->sampler->GetSamplingMat();

    PetscCall(PetscObjectQuery((PetscObject)S,"__math2opus_coords",(PetscObject*)&c));
  }
  if (c) {
    /* the block size of the Vec encodes the spatial dimension */
    PetscCall(VecGetArrayRead(c,&coords));
    PetscCall(VecGetBlockSize(c,&spacedim));
    PetscCall(MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,PETSC_FALSE,NULL,NULL));
    PetscCall(VecRestoreArrayRead(c,&coords));
  } else {
    PetscCall(MatH2OpusSetCoords_H2OPUS(A,-1,NULL,PETSC_FALSE,NULL,NULL));
  }
  PetscFunctionReturn(0);
}
744 
/* Set up everything MatMult needs: extract the H2OPUS index permutation from
   the (distributed) basis tree, build the PetscSF that maps PETSc ordering to
   H2OPUS ordering (skipped if the permutation is the identity), and allocate
   the host/device scratch buffers. Idempotent via a->multsetup. */
static PetscErrorCode MatSetUpMultiply_H2OPUS(Mat A)
{
  MPI_Comm       comm;
  PetscMPIInt    size;
  Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
  PetscInt       n = 0,*idx = NULL;
  int            *iidx = NULL; /* H2OPUS stores its index map as plain int */
  PetscCopyMode  own;
  PetscBool      rid;

  PetscFunctionBegin;
  if (a->multsetup) PetscFunctionReturn(0);
  if (a->sf) { /* MatDuplicate_H2OPUS takes reference to the SF */
    /* SF already present: only (re)allocate the scratch buffers sized to its leaves */
    PetscCall(PetscSFGetGraph(a->sf,NULL,&n,NULL,NULL));
#if defined(PETSC_H2OPUS_USE_GPU)
    a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
    a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
    a->xxs_gpu = 1;
    a->yys_gpu = 1;
#endif
    a->xx  = new thrust::host_vector<PetscScalar>(n);
    a->yy  = new thrust::host_vector<PetscScalar>(n);
    a->xxs = 1;
    a->yys = 1;
  } else {
    IS is;
    PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
    PetscCallMPI(MPI_Comm_size(comm,&size));
    if (!a->h2opus_indexmap) {
      /* pull the index map out of the appropriate (sequential or distributed) basis tree */
      if (size > 1) {
        PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
#if defined(H2OPUS_USE_MPI)
        iidx = MatH2OpusGetThrustPointer(a->dist_hmatrix->basis_tree.basis_branch.index_map);
        n    = a->dist_hmatrix->basis_tree.basis_branch.index_map.size();
#endif
      } else {
        iidx = MatH2OpusGetThrustPointer(a->hmatrix->u_basis_tree.index_map);
        n    = a->hmatrix->u_basis_tree.index_map.size();
      }

      if (PetscDefined(USE_64BIT_INDICES)) {
        /* widen int -> PetscInt; the IS takes ownership of the new buffer */
        PetscInt i;

        own  = PETSC_OWN_POINTER;
        PetscCall(PetscMalloc1(n,&idx));
        for (i=0;i<n;i++) idx[i] = iidx[i];
      } else {
        /* same width: the IS copies from H2OPUS-owned memory */
        own  = PETSC_COPY_VALUES;
        idx  = (PetscInt*)iidx;
      }
      PetscCall(ISCreateGeneral(comm,n,idx,own,&is));
      PetscCall(ISSetPermutation(is));
      PetscCall(ISViewFromOptions(is,(PetscObject)A,"-mat_h2opus_indexmap_view"));
      a->h2opus_indexmap = is;
    }
    PetscCall(ISGetLocalSize(a->h2opus_indexmap,&n));
    PetscCall(ISGetIndices(a->h2opus_indexmap,(const PetscInt **)&idx));
    /* rid: does every process keep its own local size (and, below, the identity map)? */
    rid  = (PetscBool)(n == A->rmap->n);
    PetscCall(MPIU_Allreduce(MPI_IN_PLACE,&rid,1,MPIU_BOOL,MPI_LAND,comm));
    if (rid) {
      PetscCall(ISIdentity(a->h2opus_indexmap,&rid));
    }
    if (!rid) {
      if (size > 1) { /* Parallel distribution may be different, save it here for fast path in MatMult (see MatH2OpusSetNativeMult) */
        PetscCall(PetscLayoutCreate(comm,&a->h2opus_rmap));
        PetscCall(PetscLayoutSetLocalSize(a->h2opus_rmap,n));
        PetscCall(PetscLayoutSetUp(a->h2opus_rmap));
        PetscCall(PetscLayoutReference(a->h2opus_rmap,&a->h2opus_cmap));
      }
      PetscCall(PetscSFCreate(comm,&a->sf));
      PetscCall(PetscSFSetGraphLayout(a->sf,A->rmap,n,NULL,PETSC_OWN_POINTER,idx));
      PetscCall(PetscSFViewFromOptions(a->sf,(PetscObject)A,"-mat_h2opus_sf_view"));
#if defined(PETSC_H2OPUS_USE_GPU)
      a->xx_gpu  = new thrust::device_vector<PetscScalar>(n);
      a->yy_gpu  = new thrust::device_vector<PetscScalar>(n);
      a->xxs_gpu = 1;
      a->yys_gpu = 1;
#endif
      a->xx  = new thrust::host_vector<PetscScalar>(n);
      a->yy  = new thrust::host_vector<PetscScalar>(n);
      a->xxs = 1;
      a->yys = 1;
    }
    PetscCall(ISRestoreIndices(a->h2opus_indexmap,(const PetscInt **)&idx));
  }
  a->multsetup = PETSC_TRUE;
  PetscFunctionReturn(0);
}
833 
/* (Re)build the hierarchical matrix at assembly time.
   Two construction paths exist:
     - kernel-based geometric construction (sequential and parallel), when a->kernel is set;
     - adaptive randomized construction (HARA) by sampling a user matrix (sequential only).
   Optionally (options -mat_h2opus_check[_explicit]) validates the construction by
   comparing norms of the H2 matrix against the sampled matrix.
   Note: the assemblytype argument is not consulted by this implementation. */
static PetscErrorCode MatAssemblyEnd_H2OPUS(Mat A, MatAssemblyType assemblytype)
{
  Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
#if defined(H2OPUS_USE_MPI)
  /* with MPI support the PETSc-level handle wraps the sequential H2Opus handle */
  h2opusHandle_t handle = a->handle->handle;
#else
  h2opusHandle_t handle = a->handle;
#endif
  PetscBool      kernel = PETSC_FALSE;       /* set if the kernel-based path was taken */
  PetscBool      boundtocpu = PETSC_TRUE;    /* stays TRUE in CPU-only builds */
  PetscBool      samplingdone = PETSC_FALSE; /* set if HARA sampling ran */
  MPI_Comm       comm;
  PetscMPIInt    size;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
  PetscCheckFalse(A->rmap->n != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported");
  PetscCheckFalse(A->rmap->N != A->cmap->N,comm,PETSC_ERR_SUP,"Rectangular matrices are not supported");

  /* XXX clamp the leaf size so it never exceeds the global matrix size */
  a->leafsize = PetscMin(a->leafsize, PetscMin(A->rmap->N, A->cmap->N));

  PetscCallMPI(MPI_Comm_size(comm,&size));
  /* TODO REUSABILITY of geometric construction */
  /* any previously built matrices are discarded and rebuilt from scratch */
  delete a->hmatrix;
  delete a->dist_hmatrix;
#if defined(PETSC_H2OPUS_USE_GPU)
  delete a->hmatrix_gpu;
  delete a->dist_hmatrix_gpu;
#endif
  a->orthogonal = PETSC_FALSE;

  /* TODO: other? */
  H2OpusBoxCenterAdmissibility adm(a->eta);

  PetscCall(PetscLogEventBegin(MAT_H2Opus_Build,A,0,0,0));
  /* allocate the (distributed or sequential) CPU matrix skeleton */
  if (size > 1) {
#if defined(H2OPUS_USE_MPI)
    a->dist_hmatrix = new DistributedHMatrix(A->rmap->n/* ,A->symmetric */);
#else
    a->dist_hmatrix = NULL;
#endif
  } else {
    a->hmatrix = new HMatrix(A->rmap->n,A->symmetric);
  }
  PetscCall(MatH2OpusInferCoordinates_Private(A));
  PetscCheck(a->ptcloud,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing pointcloud");
  if (a->kernel) {
    /* kernel path: fully populate the H2 matrix from the analytical kernel */
    BoxEntryGen<PetscScalar, H2OPUS_HWTYPE_CPU, PetscFunctionGenerator<PetscScalar>> entry_gen(*a->kernel);
    if (size > 1) {
      PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
#if defined(H2OPUS_USE_MPI)
      buildDistributedHMatrix(*a->dist_hmatrix,a->ptcloud,adm,entry_gen,a->leafsize,a->basisord,a->handle);
#endif
    } else {
      buildHMatrix(*a->hmatrix,a->ptcloud,adm,entry_gen,a->leafsize,a->basisord);
    }
    kernel = PETSC_TRUE;
  } else {
    /* sampling path: only the tree structure is built here; entries come from HARA below */
    PetscCheckFalse(size > 1,comm,PETSC_ERR_SUP,"Construction from sampling not supported in parallel");
    buildHMatrixStructure(*a->hmatrix,a->ptcloud,a->leafsize,adm);
  }
  PetscCall(MatSetUpMultiply_H2OPUS(A));

#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
  /* mirror the freshly built CPU matrix on the GPU when not bound to the CPU */
  if (!boundtocpu) {
    if (size > 1) {
      PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing distributed CPU matrix");
#if defined(H2OPUS_USE_MPI)
      a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
#endif
    } else {
      a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
    }
  }
#endif
  if (size == 1) {
    if (!kernel && a->sampler && a->sampler->GetSamplingMat()) {
      /* HARA construction: adaptively sample the user matrix until the
         relative tolerance a->rtol (scaled by an estimate of ||A||_2) is met */
      PetscReal Anorm;
      bool      verbose;

      PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_hara_verbose",&a->hara_verbose,NULL));
      verbose = a->hara_verbose;
      PetscCall(MatApproximateNorm_Private(a->sampler->GetSamplingMat(),NORM_2,a->norm_max_samples,&Anorm));
      if (a->hara_verbose) PetscCall(PetscPrintf(PETSC_COMM_SELF,"Sampling uses max rank %d, tol %g (%g*%g), %s samples %d\n",a->max_rank,a->rtol*Anorm,a->rtol,Anorm,boundtocpu ? "CPU" : "GPU",a->bs));
      /* if the multiply uses a permutation (non-native ordering), the sampler
         must apply the same index map */
      if (a->sf && !a->nativemult) {
        a->sampler->SetIndexMap(a->hmatrix->u_basis_tree.index_map.size(),a->hmatrix->u_basis_tree.index_map.data());
      }
      a->sampler->SetStream(handle->getMainStream());
      if (boundtocpu) {
        a->sampler->SetGPUSampling(false);
        hara(a->sampler, *a->hmatrix, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose);
#if defined(PETSC_H2OPUS_USE_GPU)
      } else {
        a->sampler->SetGPUSampling(true);
        hara(a->sampler, *a->hmatrix_gpu, a->max_rank, 10 /* TODO */,a->rtol*Anorm,a->bs,handle,verbose);
#endif
      }
      samplingdone = PETSC_TRUE;
    }
  }
#if defined(PETSC_H2OPUS_USE_GPU)
  /* when running on the GPU, drop the CPU copies and record the offload state */
  if (!boundtocpu) {
    delete a->hmatrix;
    delete a->dist_hmatrix;
    a->hmatrix = NULL;
    a->dist_hmatrix = NULL;
  }
  A->offloadmask = boundtocpu ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
#endif
  PetscCall(PetscLogEventEnd(MAT_H2Opus_Build,A,0,0,0));

  if (!a->s) a->s = 1.0; /* default scaling factor */
  A->assembled = PETSC_TRUE;

  if (samplingdone) {
    PetscBool check = a->check_construction;
    PetscBool checke = PETSC_FALSE;

    PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_check",&check,NULL));
    PetscCall(PetscOptionsGetBool(((PetscObject)A)->options,((PetscObject)A)->prefix,"-mat_h2opus_check_explicit",&checke,NULL));
    if (check) {
      /* compare norms of E = H2Mat - SampledMat against norms of SampledMat */
      Mat       E,Ae;
      PetscReal n1,ni,n2;
      PetscReal n1A,niA,n2A;
      void      (*normfunc)(void);

      Ae   = a->sampler->GetSamplingMat();
      PetscCall(MatConvert(A,MATSHELL,MAT_INITIAL_MATRIX,&E));
      PetscCall(MatShellSetOperation(E,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS));
      PetscCall(MatAXPY(E,-1.0,Ae,DIFFERENT_NONZERO_PATTERN));
      PetscCall(MatNorm(E,NORM_1,&n1));
      PetscCall(MatNorm(E,NORM_INFINITY,&ni));
      PetscCall(MatNorm(E,NORM_2,&n2));
      if (checke) {
        /* optionally dump the explicit (dense->AIJ) operators for inspection */
        Mat eA,eE,eAe;

        PetscCall(MatComputeOperator(A,MATAIJ,&eA));
        PetscCall(MatComputeOperator(E,MATAIJ,&eE));
        PetscCall(MatComputeOperator(Ae,MATAIJ,&eAe));
        PetscCall(MatChop(eA,PETSC_SMALL));
        PetscCall(MatChop(eE,PETSC_SMALL));
        PetscCall(MatChop(eAe,PETSC_SMALL));
        PetscCall(PetscObjectSetName((PetscObject)eA,"H2Mat"));
        PetscCall(MatView(eA,NULL));
        PetscCall(PetscObjectSetName((PetscObject)eAe,"S"));
        PetscCall(MatView(eAe,NULL));
        PetscCall(PetscObjectSetName((PetscObject)eE,"H2Mat - S"));
        PetscCall(MatView(eE,NULL));
        PetscCall(MatDestroy(&eA));
        PetscCall(MatDestroy(&eE));
        PetscCall(MatDestroy(&eAe));
      }

      /* temporarily use the approximate norm also for the sampling matrix,
         then restore its original norm operation */
      PetscCall(MatGetOperation(Ae,MATOP_NORM,&normfunc));
      PetscCall(MatSetOperation(Ae,MATOP_NORM,(void (*)(void))MatNorm_H2OPUS));
      PetscCall(MatNorm(Ae,NORM_1,&n1A));
      PetscCall(MatNorm(Ae,NORM_INFINITY,&niA));
      PetscCall(MatNorm(Ae,NORM_2,&n2A));
      n1A  = PetscMax(n1A,PETSC_SMALL); /* guard against division by zero below */
      n2A  = PetscMax(n2A,PETSC_SMALL);
      niA  = PetscMax(niA,PETSC_SMALL);
      PetscCall(MatSetOperation(Ae,MATOP_NORM,normfunc));
      PetscCall(PetscPrintf(PetscObjectComm((PetscObject)A),"MATH2OPUS construction errors: NORM_1 %g, NORM_INFINITY %g, NORM_2 %g (%g %g %g)\n",(double)n1,(double)ni,(double)n2,(double)(n1/n1A),(double)(ni/niA),(double)(n2/n2A)));
      PetscCall(MatDestroy(&E));
    }
    a->sampler->SetSamplingMat(NULL);
  }
  PetscFunctionReturn(0);
}
1005 
1006 static PetscErrorCode MatZeroEntries_H2OPUS(Mat A)
1007 {
1008   PetscMPIInt    size;
1009   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1010 
1011   PetscFunctionBegin;
1012   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1013   PetscCheckFalse(size > 1,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not yet supported");
1014   else {
1015     a->hmatrix->clearData();
1016 #if defined(PETSC_H2OPUS_USE_GPU)
1017     if (a->hmatrix_gpu) a->hmatrix_gpu->clearData();
1018 #endif
1019   }
1020   PetscFunctionReturn(0);
1021 }
1022 
/* Duplicate a MATH2OPUS matrix.
   Copies the construction parameters and the point cloud; deep-copies the
   H2Opus matrices (CPU and GPU) when present, and shares (by reference) the
   star forest and index map used for permuted multiplies.
   With MAT_DO_NOT_COPY_VALUES the hierarchical structure is kept but the
   numerical data is cleared. */
static PetscErrorCode MatDuplicate_H2OPUS(Mat B, MatDuplicateOption op, Mat *nA)
{
  Mat            A;
  Mat_H2OPUS     *a, *b = (Mat_H2OPUS*)B->data;
#if defined(PETSC_H2OPUS_USE_GPU)
  PetscBool      iscpu = PETSC_FALSE;
#else
  PetscBool      iscpu = PETSC_TRUE;
#endif
  MPI_Comm       comm;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)B,&comm));
  PetscCall(MatCreate(comm,&A));
  PetscCall(MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N));
  PetscCall(MatSetType(A,MATH2OPUS));
  PetscCall(MatPropagateSymmetryOptions(B,A));
  a = (Mat_H2OPUS*)A->data;

  /* scalar construction/compression parameters */
  a->eta              = b->eta;
  a->leafsize         = b->leafsize;
  a->basisord         = b->basisord;
  a->max_rank         = b->max_rank;
  a->bs               = b->bs;
  a->rtol             = b->rtol;
  a->norm_max_samples = b->norm_max_samples;
  if (op == MAT_COPY_VALUES) a->s = b->s; /* scaling factor is a "value" */

  /* the point cloud is always deep-copied; the kernel only if values are copied */
  a->ptcloud = new PetscPointCloud<PetscReal>(*b->ptcloud);
  if (op == MAT_COPY_VALUES && b->kernel) a->kernel = new PetscFunctionGenerator<PetscScalar>(*b->kernel);

#if defined(H2OPUS_USE_MPI)
  if (b->dist_hmatrix) { a->dist_hmatrix = new DistributedHMatrix(*b->dist_hmatrix); }
#if defined(PETSC_H2OPUS_USE_GPU)
  if (b->dist_hmatrix_gpu) { a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*b->dist_hmatrix_gpu); }
#endif
#endif
  if (b->hmatrix) {
    a->hmatrix = new HMatrix(*b->hmatrix);
    if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix->clearData(); /* keep structure, drop values */
  }
#if defined(PETSC_H2OPUS_USE_GPU)
  if (b->hmatrix_gpu) {
    a->hmatrix_gpu = new HMatrix_GPU(*b->hmatrix_gpu);
    if (op == MAT_DO_NOT_COPY_VALUES) a->hmatrix_gpu->clearData();
  }
#endif
  /* star forest and index map are immutable for a given structure: share them */
  if (b->sf) {
    PetscCall(PetscObjectReference((PetscObject)b->sf));
    a->sf = b->sf;
  }
  if (b->h2opus_indexmap) {
    PetscCall(PetscObjectReference((PetscObject)b->h2opus_indexmap));
    a->h2opus_indexmap = b->h2opus_indexmap;
  }

  PetscCall(MatSetUp(A));
  PetscCall(MatSetUpMultiply_H2OPUS(A));
  if (op == MAT_COPY_VALUES) {
    A->assembled = PETSC_TRUE;
    a->orthogonal = b->orthogonal;
#if defined(PETSC_H2OPUS_USE_GPU)
    A->offloadmask = B->offloadmask;
#endif
  }
#if defined(PETSC_H2OPUS_USE_GPU)
  iscpu = B->boundtocpu; /* keep the duplicate on the same execution space */
#endif
  PetscCall(MatBindToCPU(A,iscpu));

  *nA = A;
  PetscFunctionReturn(0);
}
1096 
1097 static PetscErrorCode MatView_H2OPUS(Mat A, PetscViewer view)
1098 {
1099   Mat_H2OPUS        *h2opus = (Mat_H2OPUS*)A->data;
1100   PetscBool         isascii;
1101   PetscMPIInt       size;
1102   PetscViewerFormat format;
1103 
1104   PetscFunctionBegin;
1105   PetscCall(PetscObjectTypeCompare((PetscObject)view,PETSCVIEWERASCII,&isascii));
1106   PetscCall(PetscViewerGetFormat(view,&format));
1107   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1108   if (isascii) {
1109     if (format == PETSC_VIEWER_ASCII_MATLAB) {
1110       if (size == 1) {
1111         FILE *fp;
1112         PetscCall(PetscViewerASCIIGetPointer(view,&fp));
1113         dumpHMatrix(*h2opus->hmatrix,6,fp);
1114       }
1115     } else {
1116       PetscCall(PetscViewerASCIIPrintf(view,"  H-Matrix constructed from %s\n",h2opus->kernel ? "Kernel" : "Mat"));
1117       PetscCall(PetscViewerASCIIPrintf(view,"  PointCloud dim %" PetscInt_FMT "\n",h2opus->ptcloud ? h2opus->ptcloud->getDimension() : 0));
1118       PetscCall(PetscViewerASCIIPrintf(view,"  Admissibility parameters: leaf size %" PetscInt_FMT ", eta %g\n",h2opus->leafsize,(double)h2opus->eta));
1119       if (!h2opus->kernel) {
1120         PetscCall(PetscViewerASCIIPrintf(view,"  Sampling parameters: max_rank %" PetscInt_FMT ", samples %" PetscInt_FMT ", tolerance %g\n",h2opus->max_rank,h2opus->bs,(double)h2opus->rtol));
1121       } else {
1122         PetscCall(PetscViewerASCIIPrintf(view,"  Offdiagonal blocks approximation order %" PetscInt_FMT "\n",h2opus->basisord));
1123       }
1124       PetscCall(PetscViewerASCIIPrintf(view,"  Number of samples for norms %" PetscInt_FMT "\n",h2opus->norm_max_samples));
1125       if (size == 1) {
1126         double dense_mem_cpu = h2opus->hmatrix ? h2opus->hmatrix->getDenseMemoryUsage() : 0;
1127         double low_rank_cpu = h2opus->hmatrix ? h2opus->hmatrix->getLowRankMemoryUsage() : 0;
1128 #if defined(PETSC_HAVE_CUDA)
1129         double dense_mem_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getDenseMemoryUsage() : 0;
1130         double low_rank_gpu = h2opus->hmatrix_gpu ? h2opus->hmatrix_gpu->getLowRankMemoryUsage() : 0;
1131 #endif
1132         PetscCall(PetscViewerASCIIPrintf(view,"  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_cpu, low_rank_cpu, low_rank_cpu + dense_mem_cpu));
1133 #if defined(PETSC_HAVE_CUDA)
1134         PetscCall(PetscViewerASCIIPrintf(view,"  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", dense_mem_gpu, low_rank_gpu, low_rank_gpu + dense_mem_gpu));
1135 #endif
1136       } else {
1137 #if defined(PETSC_HAVE_CUDA)
1138         double      matrix_mem[4] = {0.,0.,0.,0.};
1139         PetscMPIInt rsize = 4;
1140 #else
1141         double      matrix_mem[2] = {0.,0.};
1142         PetscMPIInt rsize = 2;
1143 #endif
1144 #if defined(H2OPUS_USE_MPI)
1145         matrix_mem[0] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalDenseMemoryUsage() : 0;
1146         matrix_mem[1] = h2opus->dist_hmatrix ? h2opus->dist_hmatrix->getLocalLowRankMemoryUsage() : 0;
1147 #if defined(PETSC_HAVE_CUDA)
1148         matrix_mem[2] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalDenseMemoryUsage() : 0;
1149         matrix_mem[3] = h2opus->dist_hmatrix_gpu ? h2opus->dist_hmatrix_gpu->getLocalLowRankMemoryUsage() : 0;
1150 #endif
1151 #endif
1152         PetscCall(MPIU_Allreduce(MPI_IN_PLACE,matrix_mem,rsize,MPI_DOUBLE_PRECISION,MPI_SUM,PetscObjectComm((PetscObject)A)));
1153         PetscCall(PetscViewerASCIIPrintf(view,"  Memory consumption GB (CPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[0], matrix_mem[1], matrix_mem[0] + matrix_mem[1]));
1154 #if defined(PETSC_HAVE_CUDA)
1155         PetscCall(PetscViewerASCIIPrintf(view,"  Memory consumption GB (GPU): %g (dense) %g (low rank) %g (total)\n", matrix_mem[2], matrix_mem[3], matrix_mem[2] + matrix_mem[3]));
1156 #endif
1157       }
1158     }
1159   }
1160 #if 0
1161   if (size == 1) {
1162     char filename[256];
1163     const char *name;
1164 
1165     PetscCall(PetscObjectGetName((PetscObject)A,&name));
1166     PetscCall(PetscSNPrintf(filename,sizeof(filename),"%s_structure.eps",name));
1167     outputEps(*h2opus->hmatrix,filename);
1168   }
1169 #endif
1170   PetscFunctionReturn(0);
1171 }
1172 
1173 static PetscErrorCode MatH2OpusSetCoords_H2OPUS(Mat A, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx)
1174 {
1175   Mat_H2OPUS     *h2opus = (Mat_H2OPUS*)A->data;
1176   PetscReal      *gcoords;
1177   PetscInt       N;
1178   MPI_Comm       comm;
1179   PetscMPIInt    size;
1180   PetscBool      cong;
1181 
1182   PetscFunctionBegin;
1183   PetscCall(PetscLayoutSetUp(A->rmap));
1184   PetscCall(PetscLayoutSetUp(A->cmap));
1185   PetscCall(PetscObjectGetComm((PetscObject)A,&comm));
1186   PetscCall(MatHasCongruentLayouts(A,&cong));
1187   PetscCheck(cong,comm,PETSC_ERR_SUP,"Only for square matrices with congruent layouts");
1188   N    = A->rmap->N;
1189   PetscCallMPI(MPI_Comm_size(comm,&size));
1190   if (spacedim > 0 && size > 1 && cdist) {
1191     PetscSF      sf;
1192     MPI_Datatype dtype;
1193 
1194     PetscCallMPI(MPI_Type_contiguous(spacedim,MPIU_REAL,&dtype));
1195     PetscCallMPI(MPI_Type_commit(&dtype));
1196 
1197     PetscCall(PetscSFCreate(comm,&sf));
1198     PetscCall(PetscSFSetGraphWithPattern(sf,A->rmap,PETSCSF_PATTERN_ALLGATHER));
1199     PetscCall(PetscMalloc1(spacedim*N,&gcoords));
1200     PetscCall(PetscSFBcastBegin(sf,dtype,coords,gcoords,MPI_REPLACE));
1201     PetscCall(PetscSFBcastEnd(sf,dtype,coords,gcoords,MPI_REPLACE));
1202     PetscCall(PetscSFDestroy(&sf));
1203     PetscCallMPI(MPI_Type_free(&dtype));
1204   } else gcoords = (PetscReal*)coords;
1205 
1206   delete h2opus->ptcloud;
1207   delete h2opus->kernel;
1208   h2opus->ptcloud = new PetscPointCloud<PetscReal>(spacedim,N,gcoords);
1209   if (kernel) h2opus->kernel = new PetscFunctionGenerator<PetscScalar>(kernel,spacedim,kernelctx);
1210   if (gcoords != coords) PetscCall(PetscFree(gcoords));
1211   A->preallocated = PETSC_TRUE;
1212   PetscFunctionReturn(0);
1213 }
1214 
#if defined(PETSC_H2OPUS_USE_GPU)
/* Bind/unbind the matrix to/from the CPU.
   Migrates the H2Opus matrix between host and device: binding to the CPU
   (flg == PETSC_TRUE) downloads the GPU matrix and frees it; unbinding
   uploads the CPU matrix and frees the host copy. Also switches the
   default vector type between VECSTANDARD and VECCUDA accordingly. */
static PetscErrorCode MatBindToCPU_H2OPUS(Mat A, PetscBool flg)
{
  PetscMPIInt    size;
  Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
  if (flg && A->offloadmask == PETSC_OFFLOAD_GPU) {
    /* GPU -> CPU: copy into an existing host matrix if present, else allocate one */
    if (size > 1) {
      PetscCheck(a->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
#if defined(H2OPUS_USE_MPI)
      if (!a->dist_hmatrix) a->dist_hmatrix = new DistributedHMatrix(*a->dist_hmatrix_gpu);
      else *a->dist_hmatrix = *a->dist_hmatrix_gpu;
#endif
    } else {
      PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
      if (!a->hmatrix) a->hmatrix = new HMatrix(*a->hmatrix_gpu);
      else *a->hmatrix = *a->hmatrix_gpu;
    }
    /* the device copies are no longer needed */
    delete a->hmatrix_gpu;
    delete a->dist_hmatrix_gpu;
    a->hmatrix_gpu = NULL;
    a->dist_hmatrix_gpu = NULL;
    A->offloadmask = PETSC_OFFLOAD_CPU;
  } else if (!flg && A->offloadmask == PETSC_OFFLOAD_CPU) {
    /* CPU -> GPU: mirror of the branch above */
    if (size > 1) {
      PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
#if defined(H2OPUS_USE_MPI)
      if (!a->dist_hmatrix_gpu) a->dist_hmatrix_gpu = new DistributedHMatrix_GPU(*a->dist_hmatrix);
      else *a->dist_hmatrix_gpu = *a->dist_hmatrix;
#endif
    } else {
      PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
      if (!a->hmatrix_gpu) a->hmatrix_gpu = new HMatrix_GPU(*a->hmatrix);
      else *a->hmatrix_gpu = *a->hmatrix;
    }
    delete a->hmatrix;
    delete a->dist_hmatrix;
    a->hmatrix = NULL;
    a->dist_hmatrix = NULL;
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }
  /* keep the default vector type consistent with the execution space */
  PetscCall(PetscFree(A->defaultvectype));
  if (!flg) {
    PetscCall(PetscStrallocpy(VECCUDA,&A->defaultvectype));
  } else {
    PetscCall(PetscStrallocpy(VECSTANDARD,&A->defaultvectype));
  }
  A->boundtocpu = flg;
  PetscFunctionReturn(0);
}
#endif
1268 
1269 /*MC
1270      MATH2OPUS = "h2opus" - A matrix type for hierarchical matrices using the H2Opus package.
1271 
1272    Options Database Keys:
.     -mat_type h2opus - sets the matrix type to "h2opus" during a call to MatSetFromOptions()
1274 
1275    Notes:
1276      H2Opus implements hierarchical matrices in the H^2 flavour.
1277      It supports CPU or NVIDIA GPUs.
1278      For CPU only builds, use ./configure --download-h2opus --download-thrust to install PETSc to use H2Opus.
1279      In order to run on NVIDIA GPUs, use ./configure --download-h2opus --download-magma --download-kblas.
1280      For details and additional references, see
1281        "H2Opus: A distributed-memory multi-GPU software package for non-local operators",
1282      available at https://arxiv.org/abs/2109.05451.
1283 
1284    Level: beginner
1285 
1286 .seealso: MATHTOOL, MATDENSE, MatCreateH2OpusFromKernel(), MatCreateH2OpusFromMat()
1287 M*/
1288 PETSC_EXTERN PetscErrorCode MatCreate_H2OPUS(Mat A)
1289 {
1290   Mat_H2OPUS     *a;
1291   PetscMPIInt    size;
1292 
1293   PetscFunctionBegin;
1294 #if defined(PETSC_H2OPUS_USE_GPU)
1295   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
1296 #endif
1297   PetscCall(PetscNewLog(A,&a));
1298   A->data = (void*)a;
1299 
1300   a->eta              = 0.9;
1301   a->leafsize         = 32;
1302   a->basisord         = 4;
1303   a->max_rank         = 64;
1304   a->bs               = 32;
1305   a->rtol             = 1.e-4;
1306   a->s                = 1.0;
1307   a->norm_max_samples = 10;
1308 #if defined(H2OPUS_USE_MPI)
1309   h2opusCreateDistributedHandleComm(&a->handle,PetscObjectComm((PetscObject)A));
1310 #else
1311   h2opusCreateHandle(&a->handle);
1312 #endif
1313   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1314   PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATH2OPUS));
1315   PetscCall(PetscMemzero(A->ops,sizeof(struct _MatOps)));
1316 
1317   A->ops->destroy          = MatDestroy_H2OPUS;
1318   A->ops->view             = MatView_H2OPUS;
1319   A->ops->assemblyend      = MatAssemblyEnd_H2OPUS;
1320   A->ops->mult             = MatMult_H2OPUS;
1321   A->ops->multtranspose    = MatMultTranspose_H2OPUS;
1322   A->ops->multadd          = MatMultAdd_H2OPUS;
1323   A->ops->multtransposeadd = MatMultTransposeAdd_H2OPUS;
1324   A->ops->scale            = MatScale_H2OPUS;
1325   A->ops->duplicate        = MatDuplicate_H2OPUS;
1326   A->ops->setfromoptions   = MatSetFromOptions_H2OPUS;
1327   A->ops->norm             = MatNorm_H2OPUS;
1328   A->ops->zeroentries      = MatZeroEntries_H2OPUS;
1329 #if defined(PETSC_H2OPUS_USE_GPU)
1330   A->ops->bindtocpu        = MatBindToCPU_H2OPUS;
1331 #endif
1332 
1333   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdense_C",MatProductSetFromOptions_H2OPUS));
1334   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_seqdensecuda_C",MatProductSetFromOptions_H2OPUS));
1335   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidense_C",MatProductSetFromOptions_H2OPUS));
1336   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_h2opus_mpidensecuda_C",MatProductSetFromOptions_H2OPUS));
1337 #if defined(PETSC_H2OPUS_USE_GPU)
1338   PetscCall(PetscFree(A->defaultvectype));
1339   PetscCall(PetscStrallocpy(VECCUDA,&A->defaultvectype));
1340 #endif
1341   PetscFunctionReturn(0);
1342 }
1343 
1344 /*@C
1345      MatH2OpusOrthogonalize - Orthogonalize the basis tree of a hierarchical matrix.
1346 
1347    Input Parameter:
1348 .     A - the matrix
1349 
1350    Level: intermediate
1351 
1352 .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel(), MatH2OpusCompress()
1353 @*/
/* Orthogonalize the basis tree of the H2 matrix, on CPU or GPU depending on
   where the matrix currently lives; no-op for non-H2OPUS matrices or when
   already orthogonal. Note the `} else {` branches below sit INSIDE
   `#if defined(PETSC_H2OPUS_USE_GPU)` regions: in CPU-only builds the GPU
   branch disappears entirely and only the CPU path is compiled. */
PetscErrorCode MatH2OpusOrthogonalize(Mat A)
{
  PetscBool      ish2opus;
  Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
  PetscMPIInt    size;
  PetscBool      boundtocpu = PETSC_TRUE; /* stays TRUE in CPU-only builds */

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidType(A,1);
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
  if (!ish2opus) PetscFunctionReturn(0);
  if (a->orthogonal) PetscFunctionReturn(0); /* already done; nothing to compute */
  HLibProfile::clear(); /* reset H2Opus profiling so the flop log below covers this call only */
  PetscCall(PetscLogEventBegin(MAT_H2Opus_Orthog,A,0,0,0));
#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
#endif
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
  if (size > 1) {
    /* distributed case */
    if (boundtocpu) {
      PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
#if defined(H2OPUS_USE_MPI)
      distributed_horthog(*a->dist_hmatrix, a->handle);
#endif
#if defined(PETSC_H2OPUS_USE_GPU)
      A->offloadmask = PETSC_OFFLOAD_CPU; /* CPU copy modified: GPU copy (if any) is stale */
    } else {
      PetscCheck(a->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
      PetscCall(PetscLogGpuTimeBegin());
#if defined(H2OPUS_USE_MPI)
      distributed_horthog(*a->dist_hmatrix_gpu, a->handle);
#endif
      PetscCall(PetscLogGpuTimeEnd());
#endif
    }
  } else {
    /* sequential case uses the plain H2Opus handle */
#if defined(H2OPUS_USE_MPI)
    h2opusHandle_t handle = a->handle->handle;
#else
    h2opusHandle_t handle = a->handle;
#endif
    if (boundtocpu) {
      PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
      horthog(*a->hmatrix, handle);
#if defined(PETSC_H2OPUS_USE_GPU)
      A->offloadmask = PETSC_OFFLOAD_CPU;
    } else {
      PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
      PetscCall(PetscLogGpuTimeBegin());
      horthog(*a->hmatrix_gpu, handle);
      PetscCall(PetscLogGpuTimeEnd());
#endif
    }
  }
  a->orthogonal = PETSC_TRUE;
  { /* log flops */
    double gops,time,perf,dev;
    HLibProfile::getHorthogPerf(gops,time,perf,dev); /* gops is reported in GFlop */
#if defined(PETSC_H2OPUS_USE_GPU)
    if (boundtocpu) {
      PetscCall(PetscLogFlops(1e9*gops));
    } else {
      PetscCall(PetscLogGpuFlops(1e9*gops));
    }
#else
    PetscCall(PetscLogFlops(1e9*gops));
#endif
  }
  PetscCall(PetscLogEventEnd(MAT_H2Opus_Orthog,A,0,0,0));
  PetscFunctionReturn(0);
}
1426 
1427 /*@C
1428      MatH2OpusCompress - Compress a hierarchical matrix.
1429 
1430    Input Parameters:
1431 +     A - the matrix
1432 -     tol - the absolute truncation threshold
1433 
1434    Level: intermediate
1435 
1436 .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel(), MatH2OpusOrthogonalize()
1437 @*/
/* Compress the H2 matrix to the absolute truncation threshold tol.
   Orthogonalizes first (compression requires an orthogonal basis), then runs
   hcompress on CPU or GPU depending on where the matrix lives; no-op for
   non-H2OPUS matrices or non-positive tol. As in MatH2OpusOrthogonalize(),
   the `} else {` branches sit inside `#if defined(PETSC_H2OPUS_USE_GPU)`
   regions: CPU-only builds compile only the CPU path. */
PetscErrorCode MatH2OpusCompress(Mat A, PetscReal tol)
{
  PetscBool      ish2opus;
  Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
  PetscMPIInt    size;
  PetscBool      boundtocpu = PETSC_TRUE; /* stays TRUE in CPU-only builds */

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidType(A,1);
  PetscValidLogicalCollectiveReal(A,tol,2);
  PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
  if (!ish2opus || tol <= 0.0) PetscFunctionReturn(0);
  PetscCall(MatH2OpusOrthogonalize(A)); /* compression needs an orthogonal basis */
  HLibProfile::clear(); /* reset H2Opus profiling so the flop log below covers this call only */
  PetscCall(PetscLogEventBegin(MAT_H2Opus_Compress,A,0,0,0));
#if defined(PETSC_H2OPUS_USE_GPU)
  boundtocpu = A->boundtocpu;
#endif
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
  if (size > 1) {
    /* distributed case */
    if (boundtocpu) {
      PetscCheck(a->dist_hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
#if defined(H2OPUS_USE_MPI)
      distributed_hcompress(*a->dist_hmatrix, tol, a->handle);
#endif
#if defined(PETSC_H2OPUS_USE_GPU)
      A->offloadmask = PETSC_OFFLOAD_CPU; /* CPU copy modified: GPU copy (if any) is stale */
    } else {
      PetscCheck(a->dist_hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
      PetscCall(PetscLogGpuTimeBegin());
#if defined(H2OPUS_USE_MPI)
      distributed_hcompress(*a->dist_hmatrix_gpu, tol, a->handle);
#endif
      PetscCall(PetscLogGpuTimeEnd());
#endif
    }
  } else {
    /* sequential case uses the plain H2Opus handle */
#if defined(H2OPUS_USE_MPI)
    h2opusHandle_t handle = a->handle->handle;
#else
    h2opusHandle_t handle = a->handle;
#endif
    if (boundtocpu) {
      PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
      hcompress(*a->hmatrix, tol, handle);
#if defined(PETSC_H2OPUS_USE_GPU)
      A->offloadmask = PETSC_OFFLOAD_CPU;
    } else {
      PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
      PetscCall(PetscLogGpuTimeBegin());
      hcompress(*a->hmatrix_gpu, tol, handle);
      PetscCall(PetscLogGpuTimeEnd());
#endif
    }
  }
  { /* log flops */
    double gops,time,perf,dev;
    HLibProfile::getHcompressPerf(gops,time,perf,dev); /* gops is reported in GFlop */
#if defined(PETSC_H2OPUS_USE_GPU)
    if (boundtocpu) {
      PetscCall(PetscLogFlops(1e9*gops));
    } else {
      PetscCall(PetscLogGpuFlops(1e9*gops));
    }
#else
    PetscCall(PetscLogFlops(1e9*gops));
#endif
  }
  PetscCall(PetscLogEventEnd(MAT_H2Opus_Compress,A,0,0,0));
  PetscFunctionReturn(0);
}
1510 
1511 /*@C
1512      MatH2OpusSetSamplingMat - Set a matrix to be sampled from matrix vector product to construct a hierarchical matrix.
1513 
1514    Input Parameters:
1515 +     A - the hierarchical matrix
1516 .     B - the matrix to be sampled
1517 .     bs - maximum number of samples to be taken concurrently
1518 -     tol - relative tolerance for construction
1519 
1520    Notes: Need to call MatAssemblyBegin/End() to update the hierarchical matrix.
1521 
1522    Level: intermediate
1523 
1524 .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel(), MatH2OpusCompress(), MatH2OpusOrthogonalize()
1525 @*/
1526 PetscErrorCode MatH2OpusSetSamplingMat(Mat A, Mat B, PetscInt bs, PetscReal tol)
1527 {
1528   PetscBool      ish2opus;
1529 
1530   PetscFunctionBegin;
1531   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
1532   PetscValidType(A,1);
1533   if (B) PetscValidHeaderSpecific(B,MAT_CLASSID,2);
1534   PetscValidLogicalCollectiveInt(A,bs,3);
1535   PetscValidLogicalCollectiveReal(A,tol,3);
1536   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
1537   if (ish2opus) {
1538     Mat_H2OPUS *a = (Mat_H2OPUS*)A->data;
1539 
1540     if (!a->sampler) a->sampler = new PetscMatrixSampler();
1541     a->sampler->SetSamplingMat(B);
1542     if (bs > 0) a->bs = bs;
1543     if (tol > 0.) a->rtol = tol;
1544     delete a->kernel;
1545   }
1546   PetscFunctionReturn(0);
1547 }
1548 
1549 /*@C
1550      MatCreateH2OpusFromKernel - Creates a MATH2OPUS from a user-supplied kernel.
1551 
1552    Input Parameters:
1553 +     comm - MPI communicator
1554 .     m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
1555 .     n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
1556 .     M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
1557 .     N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
1558 .     spacedim - dimension of the space coordinates
1559 .     coords - coordinates of the points
1560 .     cdist - whether or not coordinates are distributed
1561 .     kernel - computational kernel (or NULL)
1562 .     kernelctx - kernel context
1563 .     eta - admissibility condition tolerance
1564 .     leafsize - leaf size in cluster tree
-     basisord - approximation order for Chebyshev interpolation of low-rank blocks
1566 
1567    Output Parameter:
1568 .     nA - matrix
1569 
1570    Options Database Keys:
1571 +     -mat_h2opus_leafsize <PetscInt> - Leaf size of cluster tree
1572 .     -mat_h2opus_eta <PetscReal> - Admissibility condition tolerance
1573 .     -mat_h2opus_order <PetscInt> - Chebychev approximation order
1574 -     -mat_h2opus_normsamples <PetscInt> - Maximum number of samples to be used when estimating norms
1575 
1576    Level: intermediate
1577 
1578 .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat()
1579 @*/
1580 PetscErrorCode MatCreateH2OpusFromKernel(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, MatH2OpusKernel kernel, void *kernelctx, PetscReal eta, PetscInt leafsize, PetscInt basisord, Mat* nA)
1581 {
1582   Mat            A;
1583   Mat_H2OPUS     *h2opus;
1584 #if defined(PETSC_H2OPUS_USE_GPU)
1585   PetscBool      iscpu = PETSC_FALSE;
1586 #else
1587   PetscBool      iscpu = PETSC_TRUE;
1588 #endif
1589 
1590   PetscFunctionBegin;
1591   PetscCheckFalse(m != n,PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported");
1592   PetscCall(MatCreate(comm,&A));
1593   PetscCall(MatSetSizes(A,m,n,M,N));
1594   PetscCheckFalse(M != N,comm,PETSC_ERR_SUP,"Rectangular matrices are not supported");
1595   PetscCall(MatSetType(A,MATH2OPUS));
1596   PetscCall(MatBindToCPU(A,iscpu));
1597   PetscCall(MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,cdist,kernel,kernelctx));
1598 
1599   h2opus = (Mat_H2OPUS*)A->data;
1600   if (eta > 0.) h2opus->eta = eta;
1601   if (leafsize > 0) h2opus->leafsize = leafsize;
1602   if (basisord > 0) h2opus->basisord = basisord;
1603 
1604   *nA = A;
1605   PetscFunctionReturn(0);
1606 }
1607 
1608 /*@C
1609      MatCreateH2OpusFromMat - Creates a MATH2OPUS sampling from a user-supplied operator.
1610 
1611    Input Parameters:
1612 +     B - the matrix to be sampled
1613 .     spacedim - dimension of the space coordinates
1614 .     coords - coordinates of the points
1615 .     cdist - whether or not coordinates are distributed
1616 .     eta - admissibility condition tolerance
1617 .     leafsize - leaf size in cluster tree
1618 .     maxrank - maximum rank allowed
1619 .     bs - maximum number of samples to be taken concurrently
1620 -     rtol - relative tolerance for construction
1621 
1622    Output Parameter:
1623 .     nA - matrix
1624 
1625    Options Database Keys:
1626 +     -mat_h2opus_leafsize <PetscInt> - Leaf size of cluster tree
1627 .     -mat_h2opus_eta <PetscReal> - Admissibility condition tolerance
1628 .     -mat_h2opus_maxrank <PetscInt> - Maximum rank when constructed from matvecs
1629 .     -mat_h2opus_samples <PetscInt> - Maximum number of samples to be taken concurrently when constructing from matvecs
1630 .     -mat_h2opus_rtol <PetscReal> - Relative tolerance for construction from sampling
1631 .     -mat_h2opus_check <PetscBool> - Check error when constructing from sampling during MatAssemblyEnd()
1632 .     -mat_h2opus_hara_verbose <PetscBool> - Verbose output from hara construction
-     -mat_h2opus_normsamples <PetscInt> - Maximum number of samples to be used when estimating norms
1634 
1635    Notes: not available in parallel
1636 
1637    Level: intermediate
1638 
1639 .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromKernel()
1640 @*/
PetscErrorCode MatCreateH2OpusFromMat(Mat B, PetscInt spacedim, const PetscReal coords[], PetscBool cdist, PetscReal eta, PetscInt leafsize, PetscInt maxrank, PetscInt bs, PetscReal rtol, Mat *nA)
{
  Mat            A;
  Mat_H2OPUS     *h2opus;
  MPI_Comm       comm;
  PetscBool      boundtocpu = PETSC_TRUE; /* default: run on CPU unless B lives on the GPU */

  PetscFunctionBegin;
  /* collective validation: scalar configuration arguments must match on all ranks */
  PetscValidHeaderSpecific(B,MAT_CLASSID,1);
  PetscValidLogicalCollectiveInt(B,spacedim,2);
  PetscValidLogicalCollectiveBool(B,cdist,4);
  PetscValidLogicalCollectiveReal(B,eta,5);
  PetscValidLogicalCollectiveInt(B,leafsize,6);
  PetscValidLogicalCollectiveInt(B,maxrank,7);
  PetscValidLogicalCollectiveInt(B,bs,8);
  PetscValidLogicalCollectiveReal(B,rtol,9);
  PetscValidPointer(nA,10);
  PetscCall(PetscObjectGetComm((PetscObject)B,&comm));
  /* the hierarchical approximation only supports square matrices with square local blocks */
  PetscCheckFalse(B->rmap->n != B->cmap->n,PETSC_COMM_SELF,PETSC_ERR_SUP,"Different row and column local sizes are not supported");
  PetscCheckFalse(B->rmap->N != B->cmap->N,comm,PETSC_ERR_SUP,"Rectangular matrices are not supported");
  PetscCall(MatCreate(comm,&A));
  PetscCall(MatSetSizes(A,B->rmap->n,B->cmap->n,B->rmap->N,B->cmap->N));
#if defined(PETSC_H2OPUS_USE_GPU)
  {
    /* unbind from the CPU only when B uses CUDA vectors (any of the three
       CUDA vec type names) and B itself is not bound to the CPU */
    PetscBool iscuda;
    VecType   vtype;

    PetscCall(MatGetVecType(B,&vtype));
    PetscCall(PetscStrcmp(vtype,VECCUDA,&iscuda));
    if (!iscuda) {
      PetscCall(PetscStrcmp(vtype,VECSEQCUDA,&iscuda));
      if (!iscuda) {
        PetscCall(PetscStrcmp(vtype,VECMPICUDA,&iscuda));
      }
    }
    if (iscuda && !B->boundtocpu) boundtocpu = PETSC_FALSE;
  }
#endif
  PetscCall(MatSetType(A,MATH2OPUS));
  PetscCall(MatBindToCPU(A,boundtocpu));
  if (spacedim) { /* geometric construction only makes sense with coordinates */
    PetscCall(MatH2OpusSetCoords_H2OPUS(A,spacedim,coords,cdist,NULL,NULL));
  }
  PetscCall(MatPropagateSymmetryOptions(B,A));
  /* PetscCheck(A->symmetric,comm,PETSC_ERR_SUP,"Unsymmetric sampling does not work"); */

  /* store the sampler over B; override defaults only for positive user values */
  h2opus = (Mat_H2OPUS*)A->data;
  h2opus->sampler = new PetscMatrixSampler(B);
  if (eta > 0.) h2opus->eta = eta;
  if (leafsize > 0) h2opus->leafsize = leafsize;
  if (maxrank > 0) h2opus->max_rank = maxrank;
  if (bs > 0) h2opus->bs = bs;
  if (rtol > 0.) h2opus->rtol = rtol;
  *nA = A;
  A->preallocated = PETSC_TRUE; /* construction happens at MatAssemblyBegin/End time */
  PetscFunctionReturn(0);
}
1698 
1699 /*@C
1700      MatH2OpusGetIndexMap - Access reordering index set.
1701 
1702    Input Parameters:
1703 .     A - the matrix
1704 
1705    Output Parameter:
1706 .     indexmap - the index set for the reordering
1707 
1708    Level: intermediate
1709 
1710 .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel()
1711 @*/
1712 PetscErrorCode MatH2OpusGetIndexMap(Mat A, IS *indexmap)
1713 {
1714   PetscBool      ish2opus;
1715   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1716 
1717   PetscFunctionBegin;
1718   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
1719   PetscValidType(A,1);
1720   PetscValidPointer(indexmap,2);
1721   PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1722   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
1723   PetscCheck(ish2opus,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name);
1724   *indexmap = a->h2opus_indexmap;
1725   PetscFunctionReturn(0);
1726 }
1727 
1728 /*@C
1729      MatH2OpusMapVec - Maps a vector between PETSc and H2Opus ordering
1730 
1731    Input Parameters:
1732 +     A - the matrix
1733 .     nativetopetsc - if true, maps from H2Opus ordering to PETSc ordering. If false, applies the reverse map
1734 -     in - the vector to be mapped
1735 
1736    Output Parameter:
1737 .     out - the newly created mapped vector
1738 
1739    Level: intermediate
1740 
1741 .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel()
1742 @*/
1743 PetscErrorCode MatH2OpusMapVec(Mat A, PetscBool nativetopetsc, Vec in, Vec* out)
1744 {
1745   PetscBool      ish2opus;
1746   Mat_H2OPUS     *a = (Mat_H2OPUS*)A->data;
1747   PetscScalar    *xin,*xout;
1748   PetscBool      nm;
1749 
1750   PetscFunctionBegin;
1751   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
1752   PetscValidType(A,1);
1753   PetscValidLogicalCollectiveBool(A,nativetopetsc,2);
1754   PetscValidHeaderSpecific(in,VEC_CLASSID,3);
1755   PetscValidPointer(out,4);
1756   PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1757   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&ish2opus));
1758   PetscCheck(ish2opus,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not for type %s",((PetscObject)A)->type_name);
1759   nm   = a->nativemult;
1760   PetscCall(MatH2OpusSetNativeMult(A,(PetscBool)!nativetopetsc));
1761   PetscCall(MatCreateVecs(A,out,NULL));
1762   PetscCall(MatH2OpusSetNativeMult(A,nm));
1763   if (!a->sf) { /* same ordering */
1764     PetscCall(VecCopy(in,*out));
1765     PetscFunctionReturn(0);
1766   }
1767   PetscCall(VecGetArrayRead(in,(const PetscScalar**)&xin));
1768   PetscCall(VecGetArrayWrite(*out,&xout));
1769   if (nativetopetsc) {
1770     PetscCall(PetscSFReduceBegin(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE));
1771     PetscCall(PetscSFReduceEnd(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE));
1772   } else {
1773     PetscCall(PetscSFBcastBegin(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE));
1774     PetscCall(PetscSFBcastEnd(a->sf,MPIU_SCALAR,xin,xout,MPI_REPLACE));
1775   }
1776   PetscCall(VecRestoreArrayRead(in,(const PetscScalar**)&xin));
1777   PetscCall(VecRestoreArrayWrite(*out,&xout));
1778   PetscFunctionReturn(0);
1779 }
1780 
1781 /*@C
1782      MatH2OpusLowRankUpdate - Perform a low-rank update of the form A = A + s * U * V^T
1783 
1784    Input Parameters:
1785 +     A - the hierarchical matrix
1786 .     s - the scaling factor
1787 .     U - the dense low-rank update matrix
1788 -     V - (optional) the dense low-rank update matrix (if NULL, then V = U is assumed)
1789 
1790    Notes: The U and V matrices must be in dense format
1791 
1792    Level: intermediate
1793 
1794 .seealso:  MatCreate(), MATH2OPUS, MatCreateH2OpusFromMat(), MatCreateH2OpusFromKernel(), MatH2OpusCompress(), MatH2OpusOrthogonalize(), MATDENSE
1795 @*/
1796 PetscErrorCode MatH2OpusLowRankUpdate(Mat A, Mat U, Mat V, PetscScalar s)
1797 {
1798   PetscBool      flg;
1799 
1800   PetscFunctionBegin;
1801   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
1802   PetscValidType(A,1);
1803   PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1804   PetscValidHeaderSpecific(U,MAT_CLASSID,2);
1805   PetscCheckSameComm(A,1,U,2);
1806   if (V) {
1807     PetscValidHeaderSpecific(V,MAT_CLASSID,3);
1808     PetscCheckSameComm(A,1,V,3);
1809   }
1810   PetscValidLogicalCollectiveScalar(A,s,4);
1811 
1812   if (!V) V = U;
1813   PetscCheckFalse(U->cmap->N != V->cmap->N,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Non matching rank update %" PetscInt_FMT " != %" PetscInt_FMT,U->cmap->N,V->cmap->N);
1814   if (!U->cmap->N) PetscFunctionReturn(0);
1815   PetscCall(PetscLayoutCompare(U->rmap,A->rmap,&flg));
1816   PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"A and U must have the same row layout");
1817   PetscCall(PetscLayoutCompare(V->rmap,A->cmap,&flg));
1818   PetscCheck(flg,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"A column layout must match V row column layout");
1819   PetscCall(PetscObjectTypeCompare((PetscObject)A,MATH2OPUS,&flg));
1820   if (flg) {
1821     Mat_H2OPUS        *a = (Mat_H2OPUS*)A->data;
1822     const PetscScalar *u,*v,*uu,*vv;
1823     PetscInt          ldu,ldv;
1824     PetscMPIInt       size;
1825 #if defined(H2OPUS_USE_MPI)
1826     h2opusHandle_t    handle = a->handle->handle;
1827 #else
1828     h2opusHandle_t    handle = a->handle;
1829 #endif
1830     PetscBool         usesf = (PetscBool)(a->sf && !a->nativemult);
1831     PetscSF           usf,vsf;
1832 
1833     PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
1834     PetscCheckFalse(size > 1,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Not yet implemented in parallel");
1835     PetscCall(PetscLogEventBegin(MAT_H2Opus_LR,A,0,0,0));
1836     PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)U,&flg,MATSEQDENSE,MATMPIDENSE,""));
1837     PetscCheck(flg,PetscObjectComm((PetscObject)U),PETSC_ERR_SUP,"Not for U of type %s",((PetscObject)U)->type_name);
1838     PetscCall(PetscObjectBaseTypeCompareAny((PetscObject)V,&flg,MATSEQDENSE,MATMPIDENSE,""));
1839     PetscCheck(flg,PetscObjectComm((PetscObject)V),PETSC_ERR_SUP,"Not for V of type %s",((PetscObject)V)->type_name);
1840     PetscCall(MatDenseGetLDA(U,&ldu));
1841     PetscCall(MatDenseGetLDA(V,&ldv));
1842     PetscCall(MatBoundToCPU(A,&flg));
1843     if (usesf) {
1844       PetscInt n;
1845 
1846       PetscCall(MatDenseGetH2OpusVectorSF(U,a->sf,&usf));
1847       PetscCall(MatDenseGetH2OpusVectorSF(V,a->sf,&vsf));
1848       PetscCall(MatH2OpusResizeBuffers_Private(A,U->cmap->N,V->cmap->N));
1849       PetscCall(PetscSFGetGraph(a->sf,NULL,&n,NULL,NULL));
1850       ldu = n;
1851       ldv = n;
1852     }
1853     if (flg) {
1854       PetscCheck(a->hmatrix,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing CPU matrix");
1855       PetscCall(MatDenseGetArrayRead(U,&u));
1856       PetscCall(MatDenseGetArrayRead(V,&v));
1857       if (usesf) {
1858         vv = MatH2OpusGetThrustPointer(*a->yy);
1859         PetscCall(PetscSFBcastBegin(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE));
1860         PetscCall(PetscSFBcastEnd(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE));
1861         if (U != V) {
1862           uu = MatH2OpusGetThrustPointer(*a->xx);
1863           PetscCall(PetscSFBcastBegin(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE));
1864           PetscCall(PetscSFBcastEnd(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE));
1865         } else uu = vv;
1866       } else { uu = u; vv = v; }
1867       hlru_global(*a->hmatrix,uu,ldu,vv,ldv,U->cmap->N,s,handle);
1868       PetscCall(MatDenseRestoreArrayRead(U,&u));
1869       PetscCall(MatDenseRestoreArrayRead(V,&v));
1870     } else {
1871 #if defined(PETSC_H2OPUS_USE_GPU)
1872       PetscBool flgU, flgV;
1873 
1874       PetscCheck(a->hmatrix_gpu,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing GPU matrix");
1875       PetscCall(PetscObjectTypeCompareAny((PetscObject)U,&flgU,MATSEQDENSE,MATMPIDENSE,""));
1876       if (flgU) PetscCall(MatConvert(U,MATDENSECUDA,MAT_INPLACE_MATRIX,&U));
1877       PetscCall(PetscObjectTypeCompareAny((PetscObject)V,&flgV,MATSEQDENSE,MATMPIDENSE,""));
1878       if (flgV) PetscCall(MatConvert(V,MATDENSECUDA,MAT_INPLACE_MATRIX,&V));
1879       PetscCall(MatDenseCUDAGetArrayRead(U,&u));
1880       PetscCall(MatDenseCUDAGetArrayRead(V,&v));
1881       if (usesf) {
1882         vv = MatH2OpusGetThrustPointer(*a->yy_gpu);
1883         PetscCall(PetscSFBcastBegin(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE));
1884         PetscCall(PetscSFBcastEnd(vsf,MPIU_SCALAR,v,(PetscScalar*)vv,MPI_REPLACE));
1885         if (U != V) {
1886           uu = MatH2OpusGetThrustPointer(*a->xx_gpu);
1887           PetscCall(PetscSFBcastBegin(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE));
1888           PetscCall(PetscSFBcastEnd(usf,MPIU_SCALAR,u,(PetscScalar*)uu,MPI_REPLACE));
1889         } else uu = vv;
1890       } else { uu = u; vv = v; }
1891 #else
1892       SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"This should not happen");
1893 #endif
1894       hlru_global(*a->hmatrix_gpu,uu,ldu,vv,ldv,U->cmap->N,s,handle);
1895 #if defined(PETSC_H2OPUS_USE_GPU)
1896       PetscCall(MatDenseCUDARestoreArrayRead(U,&u));
1897       PetscCall(MatDenseCUDARestoreArrayRead(V,&v));
1898       if (flgU) PetscCall(MatConvert(U,MATDENSE,MAT_INPLACE_MATRIX,&U));
1899       if (flgV) PetscCall(MatConvert(V,MATDENSE,MAT_INPLACE_MATRIX,&V));
1900 #endif
1901     }
1902     PetscCall(PetscLogEventEnd(MAT_H2Opus_LR,A,0,0,0));
1903     a->orthogonal = PETSC_FALSE;
1904   }
1905   PetscFunctionReturn(0);
1906 }
1907 #endif
1908